In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

def load_and_examine_data(path='AWCustomers.csv'):
    """Load the customer CSV and print a structural overview of it.

    Prints the shape, the column names, the first five rows, the
    ``DataFrame.info()`` summary and the per-column missing-value counts.

    Args:
        path: Location of the CSV file. Defaults to ``'AWCustomers.csv'``
            relative to the current working directory.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    df = pd.read_csv(path)
    print("Dataset Shape:", df.shape)
    print("\nColumn Names:")
    print(df.columns.tolist())
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nData Info:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would add a stray "None" line to the output.
    df.info()
    print("\nMissing Values:")
    print(df.isnull().sum())
    return df

def select_relevant_features(df, current_year=2024):
    """Derive helper columns and keep only the features used downstream.

    Works on a copy of ``df``. When the corresponding source columns exist
    it adds:

    * ``Age`` -- ``current_year`` minus the year of ``BirthDate``.
    * ``CommuteDistance`` -- a Short/Medium/Long proxy looked up from
      ``CountryRegionName`` (unlisted countries become ``'Medium'``).
    * ``PurchasedBike`` -- a synthetic 0/1 target drawn from a fixed-seed
      random generator, created only when the column is not already present
      and ``YearlyIncome``, ``Age`` and ``NumberCarsOwned`` are all available.

    Args:
        df: Raw customer DataFrame.
        current_year: Reference year used to compute ``Age``.

    Returns:
        A new DataFrame restricted to the selected features that exist.
    """
    df_processed = df.copy()

    # Derive Age from BirthDate
    if 'BirthDate' in df_processed.columns:
        df_processed['BirthDate'] = pd.to_datetime(df_processed['BirthDate'])
        df_processed['Age'] = current_year - df_processed['BirthDate'].dt.year

    # Create CommuteDistance proxy from geographic data
    if 'CountryRegionName' in df_processed.columns:
        country_distance_map = {
            'United States': 'Short',
            'Canada': 'Medium',
            'Australia': 'Long',
            'United Kingdom': 'Short',
            'France': 'Medium',
            'Germany': 'Medium',
        }
        df_processed['CommuteDistance'] = (
            df_processed['CountryRegionName'].map(country_distance_map).fillna('Medium')
        )

    # Add a synthetic target variable, but never overwrite a real one and
    # never crash when one of its inputs could not be derived.
    if 'PurchasedBike' not in df_processed.columns:
        target_inputs = ['YearlyIncome', 'Age', 'NumberCarsOwned']
        missing_inputs = [col for col in target_inputs if col not in df_processed.columns]
        if missing_inputs:
            print(f"Skipping synthetic PurchasedBike; missing columns: {missing_inputs}")
        else:
            # A private RandomState seeded with 42 yields the same draws as
            # np.random.seed(42) + np.random.random(), without reseeding the
            # process-wide generator as a side effect.
            rng = np.random.RandomState(42)
            purchase_prob = (df_processed['YearlyIncome'] / 100000 +
                             (df_processed['Age'] - 30) / 100 +
                             (df_processed['NumberCarsOwned'] * 0.1)) / 3
            draws = rng.random_sample(len(df_processed))
            df_processed['PurchasedBike'] = (draws < purchase_prob).astype(int)

    selected_features = [
        'CustomerID',
        'Age',
        'YearlyIncome',
        'TotalChildren',
        'NumberChildrenAtHome',
        'Education',
        'Occupation',
        'HomeOwnerFlag',
        'NumberCarsOwned',
        'CommuteDistance',
        'CountryRegionName',
        'MaritalStatus',
        'Gender',
        'PurchasedBike',
    ]

    # Keep only the features that are actually present in this dataset.
    available_features = [col for col in selected_features if col in df_processed.columns]
    df_selected = df_processed[available_features].copy()

    print("Selected Features:")
    for feature in available_features:
        print(f"- {feature}")

    return df_selected

def analyze_data_types(df):
    """Label every recognised column with its attribute type and scale.

    Columns that are not in the known feature list are left out of the
    result. The labels are printed and also returned.

    Args:
        df: DataFrame whose column names are classified.

    Returns:
        dict mapping column name to a descriptive label, in column order.
    """
    label_groups = (
        (('CustomerID',), 'Identifier (Discrete, Nominal)'),
        (('Age', 'YearlyIncome'), 'Continuous (Ratio)'),
        (('TotalChildren', 'NumberChildrenAtHome', 'NumberCarsOwned'), 'Discrete (Ratio)'),
        (('Education', 'Occupation', 'CommuteDistance'), 'Discrete (Ordinal)'),
        (('CountryRegionName', 'MaritalStatus', 'Gender'), 'Discrete (Nominal)'),
        (('HomeOwnerFlag', 'PurchasedBike'), 'Binary (Nominal)'),
    )
    # Flatten the groups into a direct column -> label lookup.
    label_for = {name: label for names, label in label_groups for name in names}

    data_types = {
        column: label_for[column]
        for column in df.columns
        if column in label_for
    }

    print("\nData Types Analysis:")
    for column, description in data_types.items():
        print(f"{column}: {description}")

    return data_types

def handle_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Text-like columns (object, pandas string, or categorical dtype) are
    filled with their most frequent value, or ``'Unknown'`` when the column
    has no observed value at all. Every other column is filled with its
    median. Columns without missing values are left untouched.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame with the imputed values.
    """
    df_clean = df.copy()

    for col in df_clean.columns:
        series = df_clean[col]
        if not series.isnull().any():
            continue

        dtype = series.dtype
        is_categorical = isinstance(dtype, pd.CategoricalDtype)
        # Checking only ``dtype == 'object'`` would send categorical and
        # string-dtype columns to median(), which raises for them.
        is_textual = (pd.api.types.is_object_dtype(dtype)
                      or pd.api.types.is_string_dtype(dtype))

        if is_categorical or is_textual:
            modes = series.mode()
            if not modes.empty:
                fill_value = modes[0]
            else:
                fill_value = 'Unknown'
                # A categorical can only be filled with a declared category.
                if is_categorical and fill_value not in series.cat.categories:
                    series = series.cat.add_categories([fill_value])
            df_clean[col] = series.fillna(fill_value)
        else:
            df_clean[col] = series.fillna(series.median())

    print("\nMissing values after handling:")
    print(df_clean.isnull().sum().sum())

    return df_clean

def discretize_continuous_features(df):
    """Add binned versions of ``Age`` and ``YearlyIncome`` to a copy of ``df``.

    ``Age_Binned`` uses the edges 0/30/45/60 with an open-ended top bin and
    ``Income_Binned`` uses 0/30000/60000/90000 with an open-ended top bin.
    Bins are right-closed; the lowest edge (0) is included so that a value
    of exactly 0 lands in the first bin instead of becoming NaN. Each
    binned column is only added when its source column exists.

    Args:
        df: DataFrame to discretize; it is not modified.

    Returns:
        A new DataFrame with the extra categorical columns.
    """
    df_disc = df.copy()

    if 'Age' in df_disc.columns:
        # Open-ended upper edge so ages above 100 stay in 'Elderly'.
        df_disc['Age_Binned'] = pd.cut(df_disc['Age'],
                                       bins=[0, 30, 45, 60, float('inf')],
                                       labels=['Young', 'Middle', 'Senior', 'Elderly'],
                                       include_lowest=True)

    if 'YearlyIncome' in df_disc.columns:
        df_disc['Income_Binned'] = pd.cut(df_disc['YearlyIncome'],
                                          bins=[0, 30000, 60000, 90000, float('inf')],
                                          labels=['Low', 'Medium', 'High', 'Very High'],
                                          include_lowest=True)

    return df_disc

def standardize_numeric_features(df):
    """Standardize the numeric feature columns of a copy of ``df``.

    Every numeric column except ``CustomerID``, ``HomeOwnerFlag`` and
    ``PurchasedBike`` is replaced by the output of
    ``StandardScaler.fit_transform``. When no column qualifies, the scaler
    is returned unfitted.

    Args:
        df: DataFrame to scale; it is not modified.

    Returns:
        Tuple ``(scaled_dataframe, scaler)``.
    """
    excluded = ('CustomerID', 'HomeOwnerFlag', 'PurchasedBike')
    scaler = StandardScaler()
    df_std = df.copy()

    numeric_cols = [
        name
        for name in df_std.select_dtypes(include=[np.number]).columns
        if name not in excluded
    ]

    if len(numeric_cols) > 0:
        scaled_values = scaler.fit_transform(df_std[numeric_cols])
        df_std[numeric_cols] = scaled_values

    return df_std, scaler

def apply_one_hot_encoding(df):
    """One-hot encode every text-like or categorical column except the ID.

    Columns with object, pandas string or categorical dtype (other than
    ``CustomerID``) are replaced by ``<column>_<value>`` indicator columns.
    The indicators are emitted as integers (0/1) so they survive a later
    ``select_dtypes(include=[np.number])``; newer pandas versions would
    otherwise produce boolean columns that such a filter drops.

    Args:
        df: DataFrame to encode; it is not modified.

    Returns:
        A new DataFrame with the indicator columns appended after the
        untouched columns.
    """
    df_encoded = df.copy()

    def _is_categorical_like(dtype):
        # Treat categorical, object and string dtypes as encodable.
        return (isinstance(dtype, pd.CategoricalDtype)
                or pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype))

    categorical_cols = [
        col
        for col, dtype in df_encoded.dtypes.items()
        if col != 'CustomerID' and _is_categorical_like(dtype)
    ]

    if not categorical_cols:
        # Nothing to encode; hand back the copy unchanged.
        return df_encoded

    df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols,
                                prefix=categorical_cols, dtype=int)

    return df_encoded

def _is_binary_column(series):
    """Return True when the series has data and every value is 0 or 1."""
    observed = series.dropna()
    return (not observed.empty) and bool(observed.isin([0, 1]).all())


def calculate_similarities(df, idx1=0, idx2=1):
    """Print and return three similarity measures between two rows of ``df``.

    * Simple matching coefficient and Jaccard similarity are computed over
      the binary columns only, i.e. columns whose observed values are all
      0 or 1. Selecting by value rather than by unique-value count or column
      name keeps non-0/1 columns out of the ``== 1`` comparisons and works
      for non-string column labels.
    * Cosine similarity is computed over all columns of the two rows.

    Args:
        df: DataFrame of numeric features.
        idx1: Positional index of the first row.
        idx2: Positional index of the second row.

    Returns:
        Tuple ``(simple_matching, jaccard, cosine_similarity)``. The first
        two are 0 when ``df`` has no binary column.
    """
    row1 = df.iloc[idx1].values.reshape(1, -1)
    row2 = df.iloc[idx2].values.reshape(1, -1)

    binary_cols = [col for col in df.columns if _is_binary_column(df[col])]

    if binary_cols:
        binary_data1 = df.iloc[idx1][binary_cols].values
        binary_data2 = df.iloc[idx2][binary_cols].values

        # SMC: share of binary attributes on which both rows agree (0-0 or 1-1).
        matches = np.sum(binary_data1 == binary_data2)
        total = len(binary_data1)
        simple_matching = matches / total

        # Jaccard: 1-1 agreements over attributes where at least one row is 1.
        intersection = np.sum((binary_data1 == 1) & (binary_data2 == 1))
        union = np.sum((binary_data1 == 1) | (binary_data2 == 1))
        jaccard = intersection / union if union > 0 else 0
    else:
        simple_matching = 0
        jaccard = 0

    cosine_sim = cosine_similarity(row1, row2)[0][0]

    print(f"\nSimilarity Measures (Objects {idx1} vs {idx2}):")
    print(f"Simple Matching Coefficient: {simple_matching:.4f}")
    print(f"Jaccard Similarity: {jaccard:.4f}")
    print(f"Cosine Similarity: {cosine_sim:.4f}")

    return simple_matching, jaccard, cosine_sim

def calculate_correlation(df):
    """Print and return the correlation between commute distance and income.

    Strategy, in order:

    1. If ``CommuteDistance`` and ``YearlyIncome`` both exist, map
       Short/Medium/Long to 1/2/3 and correlate that with ``YearlyIncome``.
    2. Otherwise, if a column whose name contains ``CommuteDistance``
       exists together with ``YearlyIncome``, correlate the first such
       column with ``YearlyIncome``.
    3. Otherwise report what is available and fall back to the first two
       numeric columns, if there are at least two.

    Args:
        df: DataFrame to analyse; it is not modified.

    Returns:
        The correlation that was printed, or ``None`` when nothing could be
        computed.
    """
    # str() keeps the substring test valid for non-string column labels.
    commute_cols = [col for col in df.columns if 'CommuteDistance' in str(col)]
    income_cols = [col for col in df.columns if 'Income' in str(col)]
    has_income = 'YearlyIncome' in df.columns

    if 'CommuteDistance' in df.columns and has_income:
        # Convert CommuteDistance to numeric for correlation
        distance_map = {'Short': 1, 'Medium': 2, 'Long': 3}
        commute_numeric = df['CommuteDistance'].map(distance_map)
        correlation = commute_numeric.corr(df['YearlyIncome'])
        print(f"\nCorrelation between CommuteDistance and YearlyIncome: {correlation:.4f}")
        return correlation

    if commute_cols and has_income:
        # Use one-hot encoded commute distance columns
        commute_col = commute_cols[0]  # Take first available
        correlation = df[commute_col].astype(float).corr(df['YearlyIncome'])
        print(f"\nCorrelation between {commute_col} and YearlyIncome: {correlation:.4f}")
        return correlation

    print("\nCommuteDistance or YearlyIncome not found in data")
    print(f"Available commute columns: {commute_cols}")
    print(f"Available income columns: {income_cols}")

    # Calculate correlation between any two available numeric features as fallback
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) >= 2:
        col1, col2 = numeric_cols[0], numeric_cols[1]
        correlation = df[col1].corr(df[col2])
        print(f"Fallback correlation between {col1} and {col2}: {correlation:.4f}")
        return correlation

    return None

def main():
    """Run the full pipeline: load, select, clean, transform and compare.

    Steps, each announced on stdout:

    * Part I   -- load the CSV, select features, classify their data types.
    * Part II  -- impute missing values, bin Age/Income, standardize the
      numeric features and one-hot encode the categorical ones.
    * Part III -- similarity measures between the first two customers and
      the CommuteDistance/YearlyIncome correlation.

    Returns:
        The fully processed (standardized and one-hot encoded) DataFrame.
    """
    print("=== PART I: Feature Selection and Data Examination ===")
    df = load_and_examine_data()

    df_selected = select_relevant_features(df)
    print(f"\nSelected DataFrame Shape: {df_selected.shape}")

    data_types = analyze_data_types(df_selected)

    print("\n=== PART II: Data Preprocessing and Transformation ===")

    print("\n(a) Handling Missing Values:")
    df_clean = handle_missing_values(df_selected)

    print("\n(b,c) Discretization and Binning:")
    df_binned = discretize_continuous_features(df_clean)

    print("\n(d) Standardization:")
    df_std, scaler = standardize_numeric_features(df_binned)

    print("\n(e) One-Hot Encoding:")
    df_final = apply_one_hot_encoding(df_std)
    print(f"Final processed shape: {df_final.shape}")
    print(f"Final columns: {list(df_final.columns)}")

    print("\n=== PART III: Proximity and Correlation Analysis ===")

    # Build the feature matrix for the proximity measures:
    # - keep boolean indicator columns too (get_dummies may emit bool, which
    #   a number-only filter would silently drop);
    # - drop CustomerID: it is an unscaled identifier, not a feature, and
    #   its magnitude would dominate the cosine similarity;
    # - cast to float so both rows are plain numeric vectors.
    numeric_df = (
        df_final.select_dtypes(include=[np.number, 'bool'])
        .drop(columns=['CustomerID'], errors='ignore')
        .astype(float)
    )
    if len(numeric_df) > 1:
        calculate_similarities(numeric_df, 0, 1)

    # Correlation uses the cleaned (not yet encoded) data so that
    # CommuteDistance is still a single Short/Medium/Long column.
    calculate_correlation(df_clean)

    return df_final

# Script entry point: when the file is executed directly, run the pipeline
# and bind main()'s return value to the module-level name `processed_data`.
if __name__ == "__main__":
    processed_data = main()

=== PART I: Feature Selection and Data Examination ===
Dataset Shape: (18361, 24)

Column Names:
['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

First 5 rows:
   CustomerID Title FirstName MiddleName  LastName Suffix  \
0       21173   NaN      Chad          C      Yuan    NaN   
1       13249   NaN      Ryan        NaN     Perry    NaN   
2       29350   NaN     Julia        NaN  Thompson    NaN   
3       13503   NaN  Theodore        NaN     Gomez    NaN   
4       22803   NaN  Marshall          J      Shan    NaN   

             AddressLine1 AddressLine2         City    StateProvinceName  ...  \
0      7090 C. Mount Hood          NaN   Wollongong      New South Wales  ...   
