# Appendix

### AM41UD - Understanding Data

### PRANAV THIAGARAJAN UMAPATHY - 220366757

## Part - A

In [None]:
#importing required packages
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from scipy.stats import mode
import warnings
warnings.filterwarnings('ignore')

In [None]:
#load the raw rating triples; both text files are read without a header row
train_dt = pd.read_csv(
    "UD_Train.txt",
    sep=",",
    header=None,
    names=["User", "Product", "Rating"],
)
test_dt = pd.read_csv(
    "UD_Test.txt",
    sep=",",
    header=None,
    names=["User", "Product", "Rating"],
)

In [None]:
#clean the train columns: remove the textual wrapper from every field,
#then cast User/Product to int and Rating to float
train_dt['User'] = train_dt['User'].str.replace(r'\(User', '', regex=True).astype(int)
train_dt['Product'] = train_dt['Product'].str.replace(r'Product', '', regex=True).astype(int)
train_dt['Rating'] = (
    train_dt['Rating']
    .str.replace(r'Rating', '', regex=True)
    .str.replace(r'\)', '', regex=True)
    .astype(float)
)

In [None]:
#clean the test columns the same way; the product field here carries the
#text 'Product test' instead of 'Product'
test_dt['User'] = test_dt['User'].str.replace(r'\(User', '', regex=True).astype(int)
test_dt['Product'] = test_dt['Product'].str.replace(r'Product test', '', regex=True).astype(int)
test_dt['Rating'] = (
    test_dt['Rating']
    .str.replace(r'Rating', '', regex=True)
    .str.replace(r'\)', '', regex=True)
    .astype(float)
)

In [None]:
#count the distinct users and distinct products in the train data
total_users, total_products = (train_dt[name].nunique() for name in ("User", "Product"))

#report both counts
print("Number of users:", total_users)
print("Number of products:", total_products)

In [None]:
#reshape the train triples into the 'y' matrix: one row per User, one column per Product
y = train_dt.pivot(index='User', columns='Product', values='Rating')

#rows of 'y' are users, columns are products
num_users, num_products = y.shape

#report the shape and its two components
print("Dimensions of y: ", y.shape)
print("Number of users: ", num_users)
print("Number of products: ", num_products)

In [None]:
#mean rating received by each product
average_ratings = train_dt['Rating'].groupby(train_dt['Product']).mean()

#distribution of those per-product means
plt.hist(average_ratings, bins=10, color='teal', edgecolor='black')
plt.title('Histogram - Average Ratings')
plt.xlabel('Average Rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
#mean rating received by each product
average_ratings = train_dt.groupby('Product')['Rating'].mean()

#lowest means first; the first five rows are the worst-rated products
worst_products = average_ratings.sort_values().iloc[:5]

#show the result
print("The 5 worst products:")
print(worst_products)

In [None]:
#mean rating handed out by each user
average_ratings = train_dt['Rating'].groupby(train_dt['User']).mean()

#distribution of those per-user means
plt.hist(average_ratings, bins=10, color='teal', edgecolor='black')
plt.title('Histogram - Average Ratings by User')
plt.xlabel('Average Rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
#mean rating handed out by each user
average_ratings = train_dt.groupby('User')['Rating'].mean()

#highest means first; the first five rows are the most generous users
most_generous_users = average_ratings.sort_values(ascending=False).iloc[:5]

#show the result
print("The 5 most generous users:")
print(most_generous_users)

In [None]:
#count the distinct users and distinct products in the test data
total_users, total_products = (test_dt[name].nunique() for name in ("User", "Product"))

#report both counts
print("Number of users:", total_users)
print("Number of products:", total_products)

In [None]:
#reshape the test triples into the 'X' matrix: one row per User, one column per Product
X = test_dt.pivot(index='User', columns='Product', values='Rating')

#rows of 'X' are users, columns are products
num_users, num_products = X.shape

#report the shape and its two components
print("Dimensions of X: ", X.shape)
print("Number of users: ", num_users)
print("Number of products: ", num_products)

In [None]:
#containers for the full distance tables and the 5 nearest train products
#(with and without their distances) of every test product
product_distances = {}
top_similar_products = {}
top_similar_products_with_scores_1 = {}

#build each train product's User-indexed rating Series once, instead of
#re-filtering train_dt for every (test product, train product) pair
train_ratings_by_product = {
    train_product: train_dt[train_dt["Product"] == train_product].set_index("User")["Rating"]
    for train_product in train_dt["Product"].unique()
}

#iterating each product in the test data
for test_product in test_dt["Product"].unique():
    test_ratings = test_dt[test_dt["Product"] == test_product].set_index("User")["Rating"]

    #sum of absolute rating differences (L1 distance) to every train product;
    #the subtraction aligns on User, and users present on only one side give
    #NaN, which sum() skips
    distances = {
        train_product: (test_ratings - train_ratings).abs().sum()
        for train_product, train_ratings in train_ratings_by_product.items()
    }

    #rank once and reuse the ranking for both outputs; sorted() is stable, so
    #ties stay in the order the train products first appear
    ranked = sorted(distances.items(), key=lambda item: item[1])[:5]
    product_distances[test_product] = distances
    top_similar_products[test_product] = [product for product, _ in ranked]
    top_similar_products_with_scores_1[test_product] = ranked

#print the top 5 similar products for each test product
print("Top 5 similar products:")
for test_product, similar_products in top_similar_products.items():
    print("Test Product:", test_product, "-> Similar Products:", similar_products)

In [None]:
#display, per test product, its 5 closest train products together with their distances
top_similar_products_with_scores_1

In [None]:
#containers for the full distance tables and the 5 nearest train products
#(with and without their distances) of every test product
product_distances = {}
top_similar_products = {}
top_similar_products_with_scores_2 = {}

#build each train product's User-indexed rating Series once, instead of
#re-filtering train_dt for every (test product, train product) pair
train_ratings_by_product = {
    train_product: train_dt[train_dt["Product"] == train_product].set_index("User")["Rating"]
    for train_product in train_dt["Product"].unique()
}

#iterating over each product in the test data
for test_product in test_dt["Product"].unique():
    test_ratings = test_dt[test_dt["Product"] == test_product].set_index("User")["Rating"]

    #Euclidean distance to every train product; the subtraction aligns on
    #User, and users present on only one side give NaN, which sum() skips
    distances = {
        train_product: np.sqrt(((test_ratings - train_ratings) ** 2).sum())
        for train_product, train_ratings in train_ratings_by_product.items()
    }

    #rank once and reuse the ranking for both outputs; sorted() is stable, so
    #ties stay in the order the train products first appear
    ranked = sorted(distances.items(), key=lambda item: item[1])[:5]
    product_distances[test_product] = distances
    top_similar_products[test_product] = [product for product, _ in ranked]
    top_similar_products_with_scores_2[test_product] = ranked

#print the top 5 similar products for each test product
print("Top 5 similar products:")
for test_product, similar_products in top_similar_products.items():
    print("Test Product:", test_product, "-> Similar Products:", similar_products)

In [None]:
#display, per test product, its 5 closest train products together with their Euclidean distances
top_similar_products_with_scores_2

In [None]:
## Part - B

In [None]:
#load the churn dataset used throughout Part B
churn_df = pd.read_csv("Group 2.csv")

In [None]:
#data preparation: first look at the raw table
churn_df.columns #list the column labels

In [None]:
churn_df.head()  #display the first few rows

In [None]:
churn_df.shape  #(number of rows, number of columns)

In [None]:
churn_df.info()  #per-column dtype and non-null count, plus memory usage

In [None]:
churn_df.describe()  #statistical summary of the numerical columns

In [None]:
churn_df.isna().sum() #number of missing values in each column

In [None]:
churn_df.isna().any() #True for every column that has at least one missing value

In [None]:
#### Data Pre-processing

In [None]:
#removing unwanted columns
churn_df.drop(churn_df.columns[churn_df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
# churn_df.drop(churn_df.columns[churn_df.columns.str.contains('customer_id',case = False)],axis = 1, inplace = True)

In [None]:
#removing duplicates
churn_df.drop_duplicates(inplace=True)

In [None]:
#removing unwanted characters from Class column
churn_df['Class'] = churn_df['Class'].str.replace(r'Churn=', '', regex=True)

In [None]:
#filling the special characters in a column
churn_df['Class'] = churn_df['Class'].replace({'Y$e$s$$': 'Yes'})

In [None]:
#drop null values from class column
churn_df = churn_df.dropna(subset=['Class'])

In [None]:
#fill 'No reply' (and any other non-numeric) survey answers with the most common score
#Series.mode() ignores NaN, whereas scipy.stats.mode propagates NaN by default and,
#in recent SciPy releases, returns a scalar that cannot be indexed with [0]
survey_scores = pd.to_numeric(churn_df['survey'], errors='coerce')
mod_value = survey_scores.mode().iloc[0]
churn_df['survey'] = survey_scores.fillna(mod_value).astype(int)

In [None]:
#replace non-numeric 'dependents' entries with the column median
#anything that cannot be parsed as a number becomes NaN first
dependents_numeric = pd.to_numeric(churn_df['dependents'], errors='coerce')
#median over the parsed values, ignoring the NaN placeholders
med_value = np.nanmedian(dependents_numeric)
#write back the filled column as integers
churn_df['dependents'] = dependents_numeric.fillna(med_value).astype(int)

In [None]:
#take the magnitude of every Tenure value, round it and store it as an integer
churn_df['Tenure'] = churn_df['Tenure'].abs().round().astype(int)

In [None]:
#fill monthly cost column based on each package value
churn_df['monthly_cost'] = 0

In [None]:
#fill monthly cost column based on package prices given
churn_df['monthly_cost'] = np.where(churn_df['package'] == 1, '26', 
                            np.where(churn_df['package'] == 2, '34', 
                                     np.where(churn_df['package'] == 3, '40', '50'))).astype(int)

In [None]:
#creating a new column total_cost
churn_df['total_cost'] = churn_df['monthly_cost'] * churn_df['Tenure']

In [None]:
#arranging the columns before implementing models
churn_df = churn_df[['gender', 'location','partner','dependents','senior','Tenure','monthly_cost'
                     ,'package','survey','total_cost','Class']]

In [None]:
#per-column overview of a dataframe
def report(churn_df):
    """Summarise every column of *churn_df* in a single DataFrame.

    Returns one row per column holding its label ('Columns'), its dtype
    ('data_type'), up to the first five distinct values ('unique_samples')
    and the number of distinct values as counted by ``nunique()``
    ('n_uniques').
    """
    labels = list(churn_df.columns)
    summary = {
        'Columns': labels,
        'data_type': [churn_df[label].dtypes for label in labels],
        'unique_samples': [churn_df[label].unique()[:5] for label in labels],
        'n_uniques': [churn_df[label].nunique() for label in labels],
    }
    return pd.DataFrame(summary)
#show the per-column summary (dtype, sample values, distinct count) of the cleaned data
report(churn_df)

In [None]:
#### Exploratory Data Analysis

In [None]:
#one histogram per numerical column
churn_df.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()

#pairwise correlations between a chosen subset of columns, drawn as an annotated heatmap
colm_subset = ['Tenure', 'monthly_cost', 'total_cost','dependents']
corr_matrix = churn_df.loc[:, colm_subset].corr()
heatmap_axes = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
heatmap_axes.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
#work on a copy so the plotting preparation cannot alter churn_df
churn_df_1 = churn_df.copy()

#count customers per (Class, gender) pair; naming the Series 'Freq' before
#reset_index() keeps the column label the same on every pandas version
#(value_counts() results are named 'count' from pandas 2.0 onwards)
by_gender = (
    churn_df_1.groupby('Class')['gender']
    .value_counts()
    .rename('Freq')
    .reset_index()
    .sort_values('Class', kind='mergesort')
)

#outer-ring totals ordered by class label, so each outer wedge sits over the
#inner wedges of the same class (by_gender is ordered by class label too)
group_size = churn_df_1['Class'].value_counts().sort_index()
group_names = group_size.index
subgroup_names = by_gender['gender']
subgroup_size = by_gender['Freq']

#one colormap per class: 0.6 for the outer wedge, 0.5, 0.4, ... for its inner wedges
a, b = [plt.cm.Blues, plt.cm.Reds]
inner_colors = []
for class_label, cmap in zip(group_names, (a, b)):
    n_subgroups = int((by_gender['Class'] == class_label).sum())
    inner_colors.extend(cmap(0.5 - 0.1 * k) for k in range(n_subgroups))

#first Ring (outside): one wedge per class
fig, ax = plt.subplots()
fig.suptitle('Churn by Gender')
ax.axis('equal')
mypie, _ = ax.pie(group_size, radius=1.3, labels=group_names, colors=[a(0.6), b(0.6)])
plt.setp(mypie, width=0.3, edgecolor='white')

#second Ring (Inside): one wedge per (class, gender) pair
mypie2, _ = ax.pie(subgroup_size, radius=1.3 - 0.3, labels=subgroup_names, labeldistance=0.7, colors=inner_colors)
plt.setp(mypie2, width=0.4, edgecolor='white')
plt.margins(0, 0)
plt.show()

In [None]:
#monthly cost of churned customers: overall (left) and split by gender (right)
#NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version still provides it
#NOTE(review): the right panel assumes 'gender' holds the strings 'Male'/'Female' at this point - confirm
plt.figure(figsize=(16, 5))
churned_mask = churn_df_1['Class'] == 'Yes'

plt.subplot(1, 2, 1)
plt.title('Monthly Charge distribution')
sns.distplot(churn_df_1.loc[churned_mask, 'monthly_cost'], label='Churn')
plt.legend()

plt.subplot(1, 2, 2)
plt.title('Monthly Charge distribution Split by Gender')
for gender_label in ('Male', 'Female'):
    gender_mask = churned_mask & (churn_df_1['gender'] == gender_label)
    sns.distplot(churn_df_1.loc[gender_mask, 'monthly_cost'], label=gender_label)
plt.legend()

plt.show()

In [None]:
#overlay the monthly cost distributions of churned ('Yes') and retained ('No') customers
plt.figure(figsize=(10,5))
plt.title('Monthly Charge distribution')
for class_value, legend_label in (('Yes', 'Churn'), ('No', 'Retain')):
    class_costs = churn_df_1.loc[churn_df_1['Class'] == class_value, 'monthly_cost']
    sns.distplot(class_costs, label=legend_label)
plt.legend(loc= 'upper right')
plt.show()

In [None]:
#count plots of each categorical feature, split by churn class
categorical_features = ['gender', 'partner', 'dependents', 'survey', 'package']
#fix the hue order explicitly so the legend entries always match the bar colours;
#overriding the legend labels by hand would swap them whenever 'Yes' happens
#to be the first class seaborn encounters
class_order = ['No', 'Yes']

plt.figure(figsize=(14, 10))
for i, feature in enumerate(categorical_features, 1):
    plt.subplot(2, 3, i)
    sns.countplot(x=feature, hue='Class', hue_order=class_order, data=churn_df)
    plt.title(f'{feature} vs. Churn')
    plt.legend(title='Class', loc='upper right')
plt.tight_layout()
plt.show()

In [None]:
#### Importing required packages

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
import shap
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder()
from sklearn.metrics import cohen_kappa_score

In [None]:
#integer-encode the categorical columns; the single encoder is refitted for each column in turn
label_encoder = LabelEncoder()
for column_name in ('gender', 'location', 'partner', 'dependents', 'package', 'survey'):
    churn_df[column_name] = label_encoder.fit_transform(churn_df[column_name])

In [None]:
#### Splitting the dataset into test and train

In [None]:
#feature columns (everything except the target) and the target itself
columns = ['gender', 'location', 'partner', 'dependents', 'senior', 'Tenure', 'monthly_cost',
           'package', 'survey', 'total_cost']
X = churn_df.loc[:, columns]
y = churn_df.loc[:, 'Class']

#hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=64)

#report the size of every split
for split_label, split_data in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(split_label, split_data.shape)

In [None]:
##### Implementing statistical models

In [None]:
#fit a decision tree on the training split and predict the held-out rows
decision_tree_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = decision_tree_classifier.predict(X_test)

#accuracy and per-class precision/recall/F1 on the test split
print("Decision Tree Classifier:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

#Cohen's kappa between the true and the predicted labels
cohen_kappa = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa Score:", cohen_kappa)

In [None]:
#fit a random forest on the training split and predict the held-out rows
random_forest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

#accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print("Random forest Classifier:")
print("Accuracy:", accuracy)

#per-class precision/recall/F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

#Cohen's kappa between the true and the predicted labels
cohen_kappa = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa Score:", cohen_kappa)

In [None]:
#fit a logistic regression with default settings and predict the held-out rows
#NOTE(review): the features are not scaled and warnings are silenced at the top of
#the notebook, so a ConvergenceWarning would go unnoticed here - confirm the solver converges
logistic_model = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

#accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print("Logistic regression Classifier:")
print("Accuracy:", accuracy)

#per-class precision/recall/F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

#Cohen's kappa between the true and the predicted labels
cohen_kappa = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa Score:", cohen_kappa)

In [None]:
#XGBoost is trained on integer class codes, so learn the label <-> code mapping from y_train
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

#fit the classifier on the encoded target
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train_encoded)

#predict integer codes for the test rows and map them back to the original labels
y_pred_encoded = xgb_classifier.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

#accuracy and per-class precision/recall/F1 on the test split
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

#Cohen's kappa between the true and the predicted labels
cohen_kappa = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa:", cohen_kappa)

In [None]:
#### Hyperparameter tuning

In [None]:
#candidate values for each tuned hyperparameter (3 x 3 x 3 x 3 = 81 combinations)
param_grid = {
    'learning_rate': [0.05, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
}

#base estimator that the search clones for every combination
xgb_classifier = XGBClassifier(random_state=42)

#exhaustive 5-fold search on all cores; y_train_encoded and label_encoder
#are the ones created in the XGBoost cell above
grid_search = GridSearchCV(xgb_classifier, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train_encoded)

best_params = grid_search.best_params_

#GridSearchCV (refit=True by default) has already retrained the best
#configuration on the whole training split, so reuse that estimator
#instead of fitting an identical model a second time
xgb_classifier_best = grid_search.best_estimator_

#making predictions on the test set
y_pred_encoded = xgb_classifier_best.predict(X_test)

#decode the predicted labels back to string format
y_pred = label_encoder.inverse_transform(y_pred_encoded)

#evaluating the model with the best hyperparameters
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

#calculate Cohen's kappa
cohen_kappa = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa:", cohen_kappa)

In [None]:
from sklearn.preprocessing import LabelEncoder

#integer codes for the target: mapping learned on y_train, then applied to y_test
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

#reference model trained on every feature
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train_encoded)

#rank the features by the importance the reference model assigns to them
feature_importances = xgb_classifier.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

#keep the names of the 'num_features_to_keep' highest-ranked features
num_features_to_keep = 5
selected_features_xgb = feature_importance_df['Feature'].head(num_features_to_keep).values
print("Important Features:", selected_features_xgb)

#restrict both splits to the selected features
X_train_selected = X_train[selected_features_xgb]
X_test_selected = X_test[selected_features_xgb]

#train a fresh classifier on the reduced feature set
xgb_classifier_selected = XGBClassifier(random_state=42)
xgb_classifier_selected.fit(X_train_selected, y_train_encoded)

#predict integer codes on the reduced test split and map them back to labels
y_pred_encoded = xgb_classifier_selected.predict(X_test_selected)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

#accuracy and per-class precision/recall/F1 of the reduced model
accuracy_selected = accuracy_score(y_test, y_pred)
print("Model Accuracy with Selected Features:", accuracy_selected)
print("Classification Report:")
print(classification_report(y_test, y_pred))

#Cohen's kappa of the reduced model
cohen_kappa_selected = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa with Selected Features:", cohen_kappa_selected)

In [None]:
#rank the features of the already-trained full-feature XGBoost model by importance
feature_importances = xgb_classifier.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

#the first row after sorting holds the most important feature
most_important_feature = feature_importance_df['Feature'].iloc[0]
print("Most Important Feature:", most_important_feature)

In [None]:
from sklearn.preprocessing import LabelEncoder

#integer codes for the target: mapping learned on y_train, then applied to y_test
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

#train XGBoost on every feature
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train_encoded)

#table of features ranked by the model's own importance scores
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': xgb_classifier.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("Feature Importance & Scores:")
print(feature_importance_df)

#SHAP (SHapley Additive exPlanations) explainer built from the trained model
explainer = shap.Explainer(xgb_classifier)

#SHAP values for every row of the test split
shap_values = explainer.shap_values(X_test)

#dot-style summary plot of the SHAP values per feature
shap.summary_plot(shap_values, X_test, plot_type='dot')