In [9]:
import pandas as pd 
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split


In [2]:
# NOTE(review): both locations are absolute, machine-specific paths; the notebook
# only runs unchanged on a machine that has this exact folder layout.
# `output_directory` is defined here but is not referenced by any other visible cell.
output_directory = 'D:/Informatyka i ekonometria/praca magisterska/expresso_churn/output/'
path = 'D:/Informatyka i ekonometria/praca magisterska/expresso_churn/input/'
# Switch the process working directory to the input folder so that every relative
# file name used afterwards (reads and writes alike) resolves against it.
os.chdir(path)

# Load the raw training table from the working directory.
df = pd.read_csv('Train.csv')

In [3]:
# Columns treated as numeric features: they get outlier replacement and the
# log / Yeo-Johnson / sqrt / square transformations in later cells.
numeric_variables = ['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT',
                     'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO',
                     'ZONE1', 'ZONE2', 'REGULARITY', 'FREQ_TOP_PACK']

# Columns that are one-hot encoded with pd.get_dummies in a later cell.
categorical_variables = ['REGION', 'TOP_PACK', 'TENURE', 'MRG']

In [4]:
def preprocess_data(df):
    """Impute missing values, drop duplicate rows and remove the ``user_id`` column.

    Numeric columns have their missing entries replaced by the column mean;
    object-dtype columns have them replaced by the most frequent value (the
    first mode when there are ties). Duplicates are dropped *before* ``user_id``
    is removed, so rows that differ only by ``user_id`` are all kept.

    The input frame is never modified: imputation is done with a single
    ``DataFrame.fillna`` call that returns a new frame. The previous
    ``df[col].fillna(..., inplace=True)`` form is chained in-place assignment,
    which mutates the caller's frame on older pandas and has no effect at all
    under copy-on-write.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. A ``user_id`` column is optional.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. Columns that contain only missing values are
        returned unchanged because there is no mean / mode to fill them with.
    """
    fill_values = {}

    # Numeric columns: impute with the column mean.
    for column in df.select_dtypes(include=[np.number]).columns:
        column_mean = df[column].mean()
        # An all-missing column has a NaN mean; filling with it would be a no-op.
        if pd.notna(column_mean):
            fill_values[column] = column_mean

    # Object (categorical) columns: impute with the most frequent value.
    for column in df.select_dtypes(include=["object"]).columns:
        modes = df[column].mode()
        # mode() is empty when every entry is missing; skip instead of raising.
        if not modes.empty:
            fill_values[column] = modes.iloc[0]

    cleaned = df.fillna(value=fill_values)

    # Remove exact duplicate rows.
    cleaned = cleaned.drop_duplicates()

    # Drop the 'user_id' column, if it exists.
    cleaned = cleaned.drop(['user_id'], axis=1, errors='ignore')

    return cleaned

# Rebind `df` to the cleaned table (imputed, de-duplicated, without 'user_id').
df = preprocess_data(df)

In [5]:
# One-hot encode the categorical columns; drop_first=True omits the first level of
# each variable. All other columns of `df` are carried over unchanged.
dummy_df = pd.get_dummies(df, columns=categorical_variables, drop_first = True)
# Despite its name, `dummy_names` holds EVERY column of `dummy_df` except the target
# 'CHURN' -- i.e. the untouched non-encoded columns as well as the indicator columns.
dummy_names = list(dummy_df.columns)
dummy_names.remove('CHURN')

In [6]:
# `df` now refers to the same object as `dummy_df` (an alias, not a copy), so
# column assignments made through `df` below are also visible through `dummy_df`.
df = dummy_df

In [7]:
def replace_outliers_with_third_quartile(column):
    """Replace upper-tail outliers of a numeric Series with its third quartile.

    A value is an outlier when it is strictly greater than ``Q3 + 1.5 * IQR``.
    Outliers are replaced by ``Q3`` itself, not by the fence value. Missing
    values never compare greater than the fence, so they are left as they are.

    The replacement is vectorised with ``Series.mask`` instead of calling a
    Python lambda once per row, which matters on multi-million-row columns.

    Parameters
    ----------
    column : pandas.Series
        Numeric data. It is not modified.

    Returns
    -------
    pandas.Series
        New Series with the same index and the outliers replaced.
    """
    q3 = column.quantile(0.75)
    iqr = q3 - column.quantile(0.25)
    upper_bound = q3 + 1.5 * iqr
    # mask() swaps in q3 wherever the condition holds and keeps everything else.
    return column.mask(column > upper_bound, q3)

# Upper-tail pass: overwrite each numeric column with its upper-outlier-replaced version.
for col in numeric_variables:
    df[col] = replace_outliers_with_third_quartile(df[col])
    
def replace_outliers_with_first_quartile(column):
    """Replace lower-tail outliers of a numeric Series with its first quartile.

    A value is an outlier when it is strictly less than ``Q1 - 1.5 * IQR``.
    Outliers are replaced by ``Q1`` itself, not by the fence value. Missing
    values never compare less than the fence, so they are left as they are.

    The replacement is vectorised with ``Series.mask`` instead of calling a
    Python lambda once per row, which matters on multi-million-row columns.

    Parameters
    ----------
    column : pandas.Series
        Numeric data. It is not modified.

    Returns
    -------
    pandas.Series
        New Series with the same index and the outliers replaced.
    """
    q1 = column.quantile(0.25)
    iqr = column.quantile(0.75) - q1
    lower_bound = q1 - 1.5 * iqr
    # mask() swaps in q1 wherever the condition holds and keeps everything else.
    return column.mask(column < lower_bound, q1)

# Lower-tail pass. It runs after the upper-tail pass has already overwritten the
# columns, so the quartiles used here are computed on the modified values.
for col in numeric_variables:
    df[col] = replace_outliers_with_first_quartile(df[col])


In [10]:
# Separate the target column from the feature columns.
y = df['CHURN']
X = df.drop(['CHURN'], axis=1, errors='ignore')

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): no `stratify` argument is passed, so the CHURN class ratio is not
# forced to match between the two parts -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2137)

In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, KBinsDiscretizer

def apply_all_transformations(X_train, X_test, X_train_dummies, X_test_dummies):
    '''
    Expand the numeric features with several transformations and append the
    pass-through columns.

    For every column of X_train / X_test four derived features are produced:
    log1p ("_log"), Yeo-Johnson power transform ("_power"), square root
    ("_sqrt") and square ("_pow_2"). The Yeo-Johnson transformer is fitted on
    X_train only and then applied to X_test.

    Column layout of the result (left to right): all "_log" columns, all
    "_power" columns, all "_sqrt" columns, all "_pow_2" columns (each group in
    the column order of X_train), followed by the columns of X_train_dummies.
    The labels are generated in exactly this order; the previous version
    generated them per source column (a_log, a_power, a_sqrt, a_pow_2, b_log,
    ...) while the data was stacked per transformation, so most columns
    carried the wrong label.

    input:
        X_train, X_test: DataFrames with the numeric features (same columns).
        X_train_dummies, X_test_dummies: DataFrames appended unchanged; their
            rows must be in the same order as X_train / X_test.
    return:
        (train_df, test_df): DataFrames built from the stacked arrays. They get
        a fresh default index; the index of the inputs is not carried over.

    Note: log1p and sqrt are applied as-is, so values below -1 (log1p) or
    below 0 (sqrt) yield NaN.
    '''
    # Power Transformation (using Yeo-Johnson to handle non-positive data).
    # Fitted on the training rows only, then reused for the test rows.
    power_transformer = PowerTransformer(method='yeo-johnson')
    X_train_power = power_transformer.fit_transform(X_train)
    X_test_power = power_transformer.transform(X_test)

    # (suffix, train block, test block) in the exact order used for stacking.
    # Keeping the suffix next to its data is what keeps labels and values aligned.
    transformed_blocks = [
        ('log', np.log1p(X_train), np.log1p(X_test)),
        ('power', X_train_power, X_test_power),
        ('sqrt', np.sqrt(X_train), np.sqrt(X_test)),
        ('pow_2', np.power(X_train, 2), np.power(X_test, 2)),
    ]

    # Labels: iterate transformations in the outer loop and source columns in
    # the inner loop -- the same nesting np.hstack produces for the data.
    col_names = [
        f"{col}_{suffix}"
        for suffix, _, _ in transformed_blocks
        for col in X_train.columns
    ]
    col_names.extend(X_train_dummies.columns)

    train_parts = [np.asarray(train_block) for _, train_block, _ in transformed_blocks]
    test_parts = [np.asarray(test_block) for _, _, test_block in transformed_blocks]

    # Concatenate the transformed blocks and the pass-through columns side by side.
    X_train_combined_with_dummies = np.hstack(train_parts + [np.asarray(X_train_dummies)])
    X_test_combined_with_dummies = np.hstack(test_parts + [np.asarray(X_test_dummies)])

    # Convert back to DataFrames with the new column names.
    X_train_combined_df = pd.DataFrame(X_train_combined_with_dummies, columns=col_names)
    X_test_combined_df = pd.DataFrame(X_test_combined_with_dummies, columns=col_names)

    return X_train_combined_df, X_test_combined_df

In [12]:
# Build the expanded feature matrices and rebind X_train / X_test to them.
# The numeric columns are passed for transformation; the columns listed in
# `dummy_names` are passed as the pass-through block.
X_train, X_test = apply_all_transformations(X_train[numeric_variables], X_test[numeric_variables], X_train[dummy_names], X_test[dummy_names])


In [13]:
# Standardise every column of the expanded matrices. The scaler is fitted on the
# training part only and the same fitted scaler is applied to the test part.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
# The scaler returns arrays; wrap them back into DataFrames with the original column names.
X_train = pd.DataFrame(X_train_std, columns=X_train.columns)
X_test = pd.DataFrame(X_test_std, columns=X_test.columns)

In [14]:
# Persist the prepared features and targets as CSV files without the index column.
# NOTE(review): the file names are relative, so the files are written to the current
# working directory (set to the input folder by os.chdir earlier), not to
# `output_directory` -- confirm which location downstream notebooks read from.
X_train.to_csv('X_train_after_transformations.csv', index = False) 
X_test.to_csv('X_test_after_transformations.csv', index = False) 
y_train.to_csv('y_train.csv', index = False)
y_test.to_csv('y_test.csv', index = False)