In [7]:
import os
import pandas as pd
import numpy as np

# Define directory paths
raw_dir = os.path.join('data', 'raw')  # Define raw_dir before using it
processed_dir = os.path.join('data', 'processed')  # Define processed_dir before using it

# Create folders if they don't exist
try:
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(processed_dir, exist_ok=True)
except PermissionError:
    # Alternative: use a directory where you have write permissions
    raw_dir = os.path.join(os.path.expanduser('~'), 'data', 'raw')
    processed_dir = os.path.join(os.path.expanduser('~'), 'data', 'processed')
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(processed_dir, exist_ok=True)
    print(f"Using alternative directory: {os.path.dirname(raw_dir)}")

# Define the sample data
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')

File already exists at data\raw\sample_data.csv. Skipping CSV creation to avoid overwrite.


In [16]:
print("Current working directory:", os.getcwd())
# Fill missing with median
df['income'] = df['income'].fillna(df['income'].median())
df['age'] = df['age'].fillna(df['age'].median())


# Alternative strategies
df_fill_mean = df.fillna(df.mean(numeric_only=True))
df_fill_median = df.fillna(df.median(numeric_only=True))
df_fill_ffill = df.fillna(method='ffill')

# Threshold-based row drop example
df_drop_thresh = df.dropna(thresh=int(0.5*df.shape[1]))

Current working directory: C:\Users\prabh


  df_fill_ffill = df.fillna(method='ffill')


In [19]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# MinMax scaling for numeric_col
scaler = MinMaxScaler()
df['numeric scaled'] = scaler.fit_transform(df[['income']])

# StandardScaler
standardizer = StandardScaler()
df['numeric_standard'] = standardizer.fit_transform(df[['income']])

In [20]:
print(df)

    age   income  score zipcode           city  extra_data  numeric scaled  \
0  34.0  55000.0   0.82   90210        Beverly         NaN          0.8125   
1  45.0  52000.0   0.91   10001       New York        42.0          0.6250   
2  29.0  42000.0    NaN   60614        Chicago         NaN          0.0000   
3  50.0  58000.0   0.76   94103             SF         NaN          1.0000   
4  38.0  52000.0   0.88   73301         Austin         NaN          0.6250   
5  39.5  52000.0   0.65   12345        Unknown         5.0          0.6250   
6  41.0  49000.0   0.79   94105  San Francisco         NaN          0.4375   

   numeric_standard  
0          0.767146  
1          0.122743  
2         -2.025264  
3          1.411548  
4          0.122743  
5          0.122743  
6         -0.521659  


In [21]:
def fill_missing_median(df, columns=None):
    df_copy = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=np.number).columns
    for col in columns:
        df_copy[col] = df_copy[col].fillna(df_copy[col].median())
    return df_copy

def drop_missing(df, columns=None, threshold=None):
    df_copy = df.copy()
    if columns is not None:
        return df_copy.dropna(subset=columns)
    if threshold is not None:
        return df_copy.dropna(thresh=int(threshold*df_copy.shape[1]))
    return df_copy.dropna()

def normalize_data(df, columns=None, method='minmax'):
    df_copy = df.copy()
    if columns is None:
        columns = df_copy.select_dtypes(include=np.number).columns
    if method=='minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    df_copy[columns] = scaler.fit_transform(df_copy[columns])
    return df_copy

def correct_column_types(df):
    df_copy = df.copy()
    if 'price' in df_copy.columns:
        df_copy['price'] = df_copy['price'].str.replace('$','').astype(float)
    if 'date_str' in df_copy.columns:
        df_copy['date'] = pd.to_datetime(df_copy['date_str'], errors='coerce')
    if 'category' in df_copy.columns:
        df_copy['category'] = df_copy['category'].str.lower().astype('category')
    return df_copy