## Importing required packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC



## Reading the original data

In [2]:
org_data=pd.read_csv('/home/arnabmukher/Desktop/Sem 1/PDS/tourn_1_calibration_csv.csv')
org_data.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,rev_Range,...,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,car_buy,retdays,eqpdays,Customer_ID
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,25.99,...,U,U,U,U,U,Y,New,23.0,361.0,1000001
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,153.14,...,U,U,U,U,U,Y,UNKNOWN,,240.0,1000002
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,U,Y,U,U,U,Y,New,,1504.0,1000003
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Y,U,U,U,U,Y,New,,1812.0,1000004
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,U,U,U,U,U,Y,New,,434.0,1000005


## Data Preprocessing

#### Data cleaning by removing columns with missing values more than 50%

In [22]:
NA_percentages=org_data.isna().mean()*100
columns_to_be_dropped=NA_percentages[NA_percentages.values>50].index.to_list()
cleaned_data=org_data.drop(columns_to_be_dropped, axis=1)
cleaned_data.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,rev_Range,...,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,car_buy,eqpdays,Customer_ID
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,25.99,...,N,U,U,U,U,U,Y,New,361.0,1000001
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,153.14,...,Z,U,U,U,U,U,Y,UNKNOWN,240.0,1000002
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,N,U,Y,U,U,U,Y,New,1504.0,1000003
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,U,Y,U,U,U,U,Y,New,1812.0,1000004
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,I,U,U,U,U,U,Y,New,434.0,1000005


#### Finding number of empty cells for each categorical column

In [4]:
categorical_columns = cleaned_data.select_dtypes(include=['object']).columns
for i in categorical_columns:
  empty_strings = cleaned_data[cleaned_data[i].isnull()]
  print(f"Empty strings in column '{i}': {len(empty_strings)}")

Empty strings in column 'new_cell': 0
Empty strings in column 'crclscod': 0
Empty strings in column 'asl_flag': 0
Empty strings in column 'prizm_social_one': 7388
Empty strings in column 'csa': 40
Empty strings in column 'area': 40
Empty strings in column 'dualband': 1
Empty strings in column 'refurb_new': 1
Empty strings in column 'hnd_webcap': 10189
Empty strings in column 'ownrent': 33706
Empty strings in column 'dwlltype': 31909
Empty strings in column 'marital': 1732
Empty strings in column 'infobase': 22079
Empty strings in column 'HHstatin': 37923
Empty strings in column 'dwllsize': 38308
Empty strings in column 'ethnic': 1732
Empty strings in column 'kid0_2': 1732
Empty strings in column 'kid3_5': 1732
Empty strings in column 'kid6_10': 1732
Empty strings in column 'kid11_15': 1732
Empty strings in column 'kid16_17': 1732
Empty strings in column 'creditcd': 1732
Empty strings in column 'car_buy': 1732


#### Categorical columns

In [5]:
categorical_columns

Index(['new_cell', 'crclscod', 'asl_flag', 'prizm_social_one', 'csa', 'area',
       'dualband', 'refurb_new', 'hnd_webcap', 'ownrent', 'dwlltype',
       'marital', 'infobase', 'HHstatin', 'dwllsize', 'ethnic', 'kid0_2',
       'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17', 'creditcd', 'car_buy'],
      dtype='object')

#### Missing value imputation

In [23]:
# Iterating over numerical columns and filling missing values with mean when range is under a threshold
range_threshold = 500
mean_im=0
med_im=0
for column in cleaned_data.select_dtypes(include='number'):
    column_range = cleaned_data[column].max() - cleaned_data[column].min()
    if column_range < range_threshold:
        mean_value = np.mean(cleaned_data[column])
        cleaned_data[column].fillna(mean_value, inplace=True)
        mean_im+=1
    else:
        cleaned_data[column].fillna(cleaned_data[column].median(), inplace=True)
        med_im+=1
print(mean_im);print(med_im)
# Iterating over categorical columns and filling missing values with mode
mode_im=0
for column in cleaned_data.select_dtypes(include='object'):
    mode_value = cleaned_data[column].mode().iloc[0]
    cleaned_data[column].fillna(mode_value, inplace=True)
    mode_im+=1
print(mode_im)

41
87
23


#### Data transformation (not good to include categoricals as it may give perfect multicollinearity)

In [7]:
# # Selecting categorical and numerical columns
# categorical_cols = cleaned_data.select_dtypes(include='object').columns
# numerical_cols = cleaned_data.select_dtypes(include='number').columns

# # Performing one-hot encoding on categorical columns
# cleaned_data_encoded = pd.get_dummies(cleaned_data[categorical_cols], drop_first=True)

# # Concatenating encoded categorical columns with numerical columns of cleaned_data
# data_processed = pd.concat([cleaned_data[numerical_cols], cleaned_data_encoded], axis=1)

# data_processed.head()

####  Feature selection using Variance inflation factor and finding the best threshold

In [8]:
# # Split the data into features (X) and target (y)
# X = cleaned_data.drop(columns=categorical_columns).drop(columns=['churn'])  # Adjust column names as needed
# y = cleaned_data['churn']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Calculate VIF for each feature
# vif_values = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
# vif_data = pd.DataFrame({"Variable": X.columns, "VIF": vif_values})

# # Function to perform feature selection based on VIF threshold
# def feature_selection_with_vif(data_frame, vif_df, vif_threshold):
#     low_vif_columns = list(vif_df.loc[vif_df['VIF'] < vif_threshold, 'Variable'])
#     #data_frame_selected = data_frame.drop(columns=high_vif_columns)
#     return low_vif_columns

# # Function to train logistic regression and calculate accuracy
# def train_logistic_regression(X_train, X_test, y_train, y_test):
#     model = LogisticRegression()
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     return accuracy

# # Hyperparameter tuning loop
# best_accuracy = 0.
# best_vif_threshold = 0.
# for i in np.linspace(np.min(vif_data.VIF)+0.00001,100,200):  # Adjust the range as needed
#     X_train_selected = X_train.loc[:,feature_selection_with_vif(X_train, vif_data, vif_threshold=i)]
#     X_test_selected = X_test.loc[:,X_train_selected.columns]  # Update X_test accordingly

#     accuracy = train_logistic_regression(X_train_selected, X_test_selected, y_train, y_test)

#     if accuracy > best_accuracy:
#         best_accuracy = accuracy
#         best_vif_threshold = i
# print(f"Best VIF Threshold: {best_vif_threshold}")
# print(f"Best Test Accuracy: {best_accuracy}")

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np

# Assume cleaned_data contains your preprocessed data

# Split the data into features (X) and target (y)
X = cleaned_data.drop(columns=categorical_columns).drop(columns=['churn'])  # Adjust column names as needed
y = cleaned_data['churn']

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Calculate VIF for each feature
vif_values = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif_data = pd.DataFrame({"Variable": X_train.columns, "VIF": vif_values})

# Function to perform feature selection based on VIF threshold
def feature_selection_with_vif(data_frame, vif_df, vif_threshold):
    low_vif_columns = list(vif_df.loc[vif_df['VIF'] < vif_threshold, 'Variable'])
    return low_vif_columns

# Function to train logistic regression and calculate accuracy
def train_logistic_regression(X_train, X_val, y_train, y_val):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred_val = model.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_pred_val)
    return accuracy_val

# Hyperparameter tuning loop
best_accuracy = 0.
best_vif_threshold = 0.
for i in np.linspace(np.min(vif_data.VIF)+0.00001, 100, 200):  # Adjust the range as needed
    X_train_selected = X_train.loc[:, feature_selection_with_vif(X_train, vif_data, vif_threshold=i)]
    X_val_selected = X_val.loc[:, X_train_selected.columns]  # Update X_val accordingly

    accuracy_val = train_logistic_regression(X_train_selected, X_val_selected, y_train, y_val)

    if accuracy_val > best_accuracy:
        best_accuracy = accuracy_val
        best_vif_threshold = i

print(f"Best VIF Threshold: {best_vif_threshold}")
print(f"Best Validation Accuracy: {best_accuracy}")

X_train_selected = X_train.loc[:,feature_selection_with_vif(X_train, vif_data, vif_threshold=best_vif_threshold)]
X_test_selected = X_test.loc[:,X_train_selected.columns]

# # Now, you can retrain the model using the best hyperparameters on the combined training and validation sets
# X_combined = pd.concat([X_train_selected, X_val_selected])
# y_combined = pd.concat([y_train, y_val])

final_model_accuracy = train_logistic_regression(X_train_selected, X_test_selected, y_train, y_test)
print(f"Final Test Accuracy: {final_model_accuracy}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best VIF Threshold: 50.27488369056178
Best Validation Accuracy: 0.5823333333333334
Final Test Accuracy: 0.5692


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
len(X_train_selected.columns)

64

#### Selected features from given threshold

In [10]:
print(feature_selection_with_vif(X_train, vif_data, vif_threshold=best_vif_threshold))

['totmrc_Mean', 'da_Mean', 'roam_Mean', 'rev_Range', 'mou_Range', 'totmrc_Range', 'da_Range', 'datovr_Range', 'roam_Range', 'change_mou', 'change_rev', 'recv_sms_Mean', 'custcare_Mean', 'threeway_Mean', 'owylis_vce_Mean', 'mouowylisv_Mean', 'iwylis_vce_Mean', 'mouiwylisv_Mean', 'callfwdv_Mean', 'callwait_Mean', 'drop_vce_Range', 'drop_dat_Range', 'blck_vce_Range', 'blck_dat_Range', 'unan_vce_Range', 'unan_dat_Range', 'recv_vce_Range', 'recv_sms_Range', 'custcare_Range', 'inonemin_Range', 'threeway_Range', 'mou_cvce_Range', 'mou_rvce_Range', 'owylis_vce_Range', 'mouowylisv_Range', 'iwylis_vce_Range', 'mouiwylisv_Range', 'peak_vce_Range', 'mou_peav_Range', 'mou_pead_Range', 'opk_vce_Range', 'mou_opkv_Range', 'drop_blk_Range', 'callfwdv_Range', 'callwait_Range', 'months', 'uniqsubs', 'actvsubs', 'avgrev', 'avg6rev', 'hnd_price', 'phones', 'models', 'truck', 'mtrcycle', 'rv', 'lor', 'age1', 'age2', 'adults', 'income', 'numbcars', 'forgntvl', 'eqpdays']


#### VIF values of each column in the cleaned data

In [11]:
vif_data

Unnamed: 0,Variable,VIF
0,rev_Mean,350.528978
1,mou_Mean,474.825324
2,totmrc_Mean,36.392065
3,da_Mean,4.510902
4,ovrmou_Mean,58.855797
...,...,...
122,income,11.197398
123,numbcars,13.963365
124,forgntvl,1.113005
125,eqpdays,9.668210


#### Classification using Multilayer Perceptron

In [12]:
# Initialize the MLP classifier
clf = MLPClassifier(hidden_layer_sizes=(100, 100), activation='logistic', solver='adam', random_state=42)

# Train the classifier on the training data
clf.fit(X_train_selected, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test_selected)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.5890666666666666




#### Classification using KNN

In [13]:
X_val_selected = X_val.loc[:,X_train_selected.columns]

# Assuming X_train_selected and X_test_selected are sparse matrices
X_train_selected_dense = np.asarray(csr_matrix(X_train_selected).todense())
X_val_selected_dense = np.asarray(csr_matrix(X_val_selected).todense())
X_test_selected_dense = np.asarray(csr_matrix(X_test_selected).todense())

best_accuracy=0.
for i in range(1,100,2):
    # Initialize the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=i)
    
    # Train the classifier on the training data
    knn.fit(X_train_selected_dense, y_train)
    
    # Make predictions on the validation data
    y_pred = knn.predict(X_val_selected_dense)
    
    # Calculate the accuracy of the classifier
    accuracy = accuracy_score(y_val, y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_no_of_neighbours = i
print(f"Best number of neighbours: {best_no_of_neighbours}")
print(f"Best Validation Accuracy: {best_accuracy}")

knn = KNeighborsClassifier(n_neighbors=best_no_of_neighbours)
    
# Train the classifier on the training data
knn.fit(X_train_selected_dense, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test_selected_dense)
final_model_accuracy = accuracy_score(y_test, y_pred)
print(f"Final Test Accuracy: {final_model_accuracy}")


Best number of neighbours: 89
Best Validation Accuracy: 0.5855333333333334
Final Test Accuracy: 0.5725333333333333


#### Classification using Decision Tree

In [14]:
# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
clf.fit(X_train_selected, y_train)

# Make predictions on the testing data
y_pred = clf.predict(X_test_selected)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)

Accuracy: 0.5451333333333334


#### Classification using Random Forest

In [15]:
# Create a Random Forest classifier
clf = RandomForestClassifier(random_state=42)

# Train the classifier on the training data
clf.fit(X_train_selected, y_train)

# Make predictions on the testing data
y_pred = clf.predict(X_test_selected)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)

Accuracy: 0.6152666666666666


#### Classification using XGBoost

In [16]:
# Create an XGBoost classifier
clf = xgb.XGBClassifier(random_state=42)

# Train the classifier on the training data
clf.fit(X_train_selected, y_train)

# Make predictions on the testing data
y_pred = clf.predict(X_test_selected)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)

Accuracy: 0.6205333333333334


#### Classification using Support vector machine

In [17]:
# Standardize the data (optional but recommended for SVM)
scaler = StandardScaler()
X_train_selected_standardised = scaler.fit_transform(X_train_selected)
X_test_selected_standardised = scaler.transform(X_test_selected)

# Create an SVM classifier
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Train the SVM classifier
svm_classifier.fit(X_train_selected_standardised, y_train)

# Make predictions on the testing data
y_pred = svm_classifier.predict(X_test_selected_standardised)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)

Accuracy: 0.5881333333333333
