# Student Number:21203781
# This Notebook Includes Classification Using Tabular Data and Text Data, by Implementing Different Machine Learning Models

In [1]:
#Importing all necessary packages
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Load the tabular survey data; the file is tab-delimited despite its .csv extension.
df = pd.read_csv(
    filepath_or_buffer=r"E:\Data Science-UCD\bank-data\bank-tabular.csv",
    delimiter="\t",
)
df

Unnamed: 0,customer_id,date,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
0,216604,2022-08-22,Male,50.00,Munster,Personal,True,False,4.00,5.00,4.00,4.00,4.00,4.00,5.00,2.00,4.00,True
1,259276,2022-11-23,Female,61.00,Leinster,Personal,True,False,5.00,5.00,5.00,3.00,5.00,4.00,4.00,5.00,5.00,True
2,265459,2022-01-21,Female,63.00,Munster,Business,True,False,2.00,2.00,5.00,5.00,2.00,,4.00,4.00,,True
3,58770,2022-03-13,f,,Leinster,Business,True,False,,4.00,5.00,1.00,5.00,5.00,5.00,5.00,5.00,True
4,318031,2022-08-08,Female,41.00,Leinster,Personal,True,True,1.00,1.00,1.00,1.00,2.00,2.00,4.00,5.00,2.00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,322582,2021-09-23,Male,41.00,Munster,Personal,True,True,3.00,3.00,3.00,3.00,5.00,3.00,3.00,1.00,5.00,False
2996,53418,2021-03-07,f,57.00,Munster,Business,False,False,3.00,2.00,5.00,1.00,2.00,2.00,2.00,2.00,3.00,True
2997,79364,2021-08-01,m,,Munster,Personal,True,True,3.00,3.00,3.00,4.00,4.00,3.00,4.00,4.00,4.00,False
2998,371134,2021-06-25,m,42.00,Leinster,Business,False,False,3.00,2.00,1.00,5.00,4.00,4.00,3.00,4.00,1.00,True


In [3]:
# Load the free-text customer comments; this file is also tab-delimited.
df_1 = pd.read_csv(
    filepath_or_buffer=r"E:\Data Science-UCD\bank-data\bank-comments.csv",
    delimiter="\t",
)
df_1

Unnamed: 0,customer_id,date,comments
0,216604,2022-08-22,"Overal, this bank is satisfactory."
1,259276,2022-11-23,Easy to find zhe bank ' s branches and ATMs. A...
2,265459,2022-01-21,Bank's phone app is really great. In general a...
3,58770,2022-03-13,
4,318031,2022-08-08,
...,...,...,...
2995,322582,2021-09-23,No comment
2996,53418,2021-03-07,Online banking is really good
2997,79364,2021-08-01,customer service quality from this bank is ter...
2998,371134,2021-06-25,Great to see that my bank supports local sport...


In [4]:
# Build the modelling frame: one row per customer, holding the free-text
# comment (from the comments file) and the `satisfied` label (from the
# tabular file), indexed by customer_id.
#
# The two sources are combined on their default row index, i.e. by row
# position, not by joining on customer_id. Verify first that row i of both
# files refers to the same customer; otherwise a comment would silently be
# paired with another customer's label.
assert df["customer_id"].equals(df_1["customer_id"]), (
    "bank-tabular and bank-comments do not list customers in the same row order"
)
df2 = (
    pd.DataFrame()
    .assign(
        customer_id=df["customer_id"],
        comments=df_1["comments"],
        satisfied=df["satisfied"],
    )
    .set_index("customer_id")
)
df2

Unnamed: 0_level_0,comments,satisfied
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
216604,"Overal, this bank is satisfactory.",True
259276,Easy to find zhe bank ' s branches and ATMs. A...,True
265459,Bank's phone app is really great. In general a...,True
58770,,True
318031,,False
...,...,...
322582,No comment,False
53418,Online banking is really good,True
79364,customer service quality from this bank is ter...,False
371134,Great to see that my bank supports local sport...,True


## Data Preparation

In [5]:
# Convert every boolean column (here: the dependent variable `satisfied`)
# to a binary 0/1 integer column.
#
# A direct cast always maps False -> 0 and True -> 1. Going through category
# codes instead numbers the *observed* values, so a column that happened to
# contain only True would be encoded as all zeros. int8 matches the dtype
# that category codes produce.
bool_cols = df2.select_dtypes(include="bool").columns
df2 = df2.astype({col_name: "int8" for col_name in bool_cols})
df2

Unnamed: 0_level_0,comments,satisfied
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
216604,"Overal, this bank is satisfactory.",1
259276,Easy to find zhe bank ' s branches and ATMs. A...,1
265459,Bank's phone app is really great. In general a...,1
58770,,1
318031,,0
...,...,...
322582,No comment,0
53418,Online banking is really good,1
79364,customer service quality from this bank is ter...,0
371134,Great to see that my bank supports local sport...,1


In [6]:
# Count the missing values in each column
df2.isnull().sum()

comments     515
satisfied      0
dtype: int64

In [7]:
# Fill each missing comment with the single most frequently occurring comment.
# About 17% of the rows (515 of 3000) have no comment, so the gaps are
# imputed rather than leaving that share of the data unusable.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(df2[["comments"]])
filled_comments = imputer.transform(df2[["comments"]])
df2["comments"] = filled_comments


## Data Split for training

In [8]:
# Target variable for training: the `satisfied` label of each customer.
target = df2.loc[:, "satisfied"]
# class balance of the label
target.value_counts()

0    1737
1    1263
Name: satisfied, dtype: int64

In [9]:
# Turn the comments into a bag-of-words count matrix:
# one row per comment, one column per vocabulary term.
from sklearn.feature_extraction.text import CountVectorizer

# the text of every comment
documents = df2["comments"]

# min_df=10 keeps only terms that occur in at least 10 comments;
# stop_words=None means no stop-word filtering is applied.
# NOTE(review): the vocabulary is fitted on all comments, before the
# train/test split further down, so the test rows also influence which terms
# are kept.
vectorizer = CountVectorizer(stop_words=None, min_df=10)
X = vectorizer.fit_transform(documents)

# size of the resulting representation: (number of comments, number of terms)
n_documents, n_terms = X.shape
print((n_documents, n_terms))

(3000, 342)


In [10]:
# The fitted vocabulary: one entry per column of X
terms = vectorizer.get_feature_names_out()
print("Vocabulary has %d distinct terms" % (len(terms),))

Vocabulary has 342 distinct terms


In [11]:
# Hold out 30% of the comments as a test set.
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split, and every score computed from
# it, is the same on each Restart & Run All. stratify=target keeps the
# satisfied / not-satisfied proportions the same in both partitions.
data_train, data_test, target_train, target_test = train_test_split(
    X, target, test_size=0.3, random_state=42, stratify=target
)

# report how many comments landed in each partition
# (the messages say "tweets", but the rows are bank customer comments)
print("Training set has %d tweets" % data_train.shape[0])
print("Test set has %d tweets" % data_test.shape[0])

Training set has 2100 tweets
Test set has 900 tweets


## Training Different Machine Learning Models on the Data, to Be Compared at the End

In [12]:
# k-nearest-neighbours classifier using the 3 nearest neighbours,
# fitted on the training partition
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X=data_train, y=target_train)

In [13]:
# Predict a label for every comment in the held-out test set
predicted = model.predict(X=data_test)
predicted

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
# Evaluate the k-NN predictions against the held-out labels
from sklearn.metrics import accuracy_score

knn_accuracy = accuracy_score(target_test, predicted)
knn_balanced_accuracy = balanced_accuracy_score(target_test, predicted)
print("Accuracy = %.4f" % knn_accuracy)
print("Balanced Accuracy = %.4f" % knn_balanced_accuracy)

Accuracy = 0.8489
Balanced Accuracy = 0.8263


In [15]:
# Repeat the k-NN classification and evaluation on a different, 80/20
# train/test split.
#
# random_state pins the shuffle so the scores are reproducible; stratify=target
# keeps the class proportions equal in both partitions.
#
# Note: this rebinds data_train / data_test / target_train / target_test, so
# the later model cells that use these names (logistic regression, SVM,
# decision tree, random forest) are trained and scored on this 80/20 split.
data_train, data_test, target_train, target_test = train_test_split(
    X, target, test_size=0.2, random_state=42, stratify=target
)
model = KNeighborsClassifier(n_neighbors=3)
model.fit(data_train, target_train)
predicted = model.predict(data_test)
print("Accuracy = %.4f" % accuracy_score(target_test, predicted))
print("Balanced Accuracy = %.4f" % balanced_accuracy_score(target_test, predicted))

Accuracy = 0.8650
Balanced Accuracy = 0.8455


In [16]:
from sklearn.model_selection import cross_val_score

# A fresh, unfitted 3-NN classifier; cross_val_score fits it once per fold
model = KNeighborsClassifier(n_neighbors=3)
# 5-fold cross-validation over the full feature matrix, recording accuracy per fold
acc_scores = cross_val_score(estimator=model, X=X, y=target, scoring="accuracy", cv=5)

In [17]:
# Present the per-fold accuracies as a Series labelled "Fold 1", "Fold 2", ...
labels = ["Fold %d" % fold_no for fold_no, _ in enumerate(acc_scores, start=1)]
s_acc = pd.Series(data=acc_scores, index=labels)
s_acc

Fold 1   0.81
Fold 2   0.84
Fold 3   0.88
Fold 4   0.85
Fold 5   0.86
dtype: float64

In [18]:
# Average accuracy across all cross-validation folds
mean_accuracy = s_acc.mean()
print("Mean accuracy: %.4f" % mean_accuracy)

Mean accuracy: 0.8470


In [19]:
# Logistic regression on the bag-of-words features
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 1000 to give the solver more iterations to converge
lr = LogisticRegression(max_iter=1000)
lr.fit(data_train, target_train)

# accuracy on the held-out and on the training partition
lr_test_score = lr.score(data_test, target_test)
lr_train_score = lr.score(data_train, target_train)
print("score on test: " + str(lr_test_score))
print("score on train: " + str(lr_train_score))

# balanced accuracy on the held-out partition
predict_lr = lr.predict(data_test)
print("Balanced Accuracy = %.4f" % balanced_accuracy_score(target_test, predict_lr))

score on test: 0.8933333333333333
score on train: 0.88875
Balanced Accuracy = 0.8727


In [20]:
# Linear support vector machine on the bag-of-words features
from sklearn.svm import LinearSVC

# NOTE(review): C=0.0001 is far below scikit-learn's default of C=1.0, i.e.
# very strong regularisation. The recorded train and test scores for this cell
# are both low and nearly equal, which looks like underfitting. Worth tuning C
# before concluding anything about SVMs on this data.
svm = LinearSVC(C=0.0001)
svm.fit(data_train, target_train)

# accuracy on the held-out and on the training partition
svm_test_score = svm.score(data_test, target_test)
svm_train_score = svm.score(data_train, target_train)
print("score on test: " + str(svm_test_score))
print("score on train: " + str(svm_train_score))

# balanced accuracy on the held-out partition
predict_svm = svm.predict(data_test)
print("Balanced Accuracy = %.4f" % balanced_accuracy_score(target_test, predict_svm))

score on test: 0.6766666666666666
score on train: 0.69125
Balanced Accuracy = 0.6104


In [21]:
# Decision tree on the bag-of-words features
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the fitted tree, and therefore the scores below,
# are reproducible between runs (scikit-learn shuffles candidate features at
# each split). 0 matches the seed used for the random forest in this notebook.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(data_train, target_train)

# accuracy on the held-out and on the training partition
print("score on test: "  + str(dt.score(data_test, target_test)))
print("score on train: " + str(dt.score(data_train, target_train)))

# balanced accuracy on the held-out partition
predict_dt = dt.predict(data_test)
print("Balanced Accuracy = %.4f" % balanced_accuracy_score(target_test, predict_dt))

score on test: 0.8766666666666667
score on train: 0.8979166666666667
Balanced Accuracy = 0.8561


In [22]:
# Random forest on the bag-of-words features
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): max_depth=2 limits every tree to two levels of splits. The
# recorded train and test scores for this cell are both low and nearly equal,
# which looks like underfitting. Worth trying deeper trees before concluding
# anything about random forests on this data.
rfc = RandomForestClassifier(random_state=0, max_depth=2)
rfc.fit(data_train, target_train)

# accuracy on the held-out and on the training partition
rfc_test_score = rfc.score(data_test, target_test)
rfc_train_score = rfc.score(data_train, target_train)
print("score on test: "  + str(rfc_test_score))
print("score on train: " + str(rfc_train_score))

# balanced accuracy on the held-out partition
predict_rfc = rfc.predict(data_test)
print("Balanced Accuracy = %.4f" % balanced_accuracy_score(target_test, predict_rfc))

score on test: 0.6416666666666667
score on train: 0.6529166666666667
Balanced Accuracy = 0.5683


## Conclusion, Challenges and Future Work

Conclusion: The best-performing models on the textual data are Logistic Regression, Decision Trees and KNN. These models performed similarly well on both textual and tabular data.
On the other hand, Random Forest and SVM, as configured in this notebook, failed to produce acceptable results on textual data even though they performed well on tabular data.
Hence these classifiers, with the settings used here, are not well suited to this textual data; other hyperparameter choices were not explored and may give different results.

Challenges: Since the data had multiple categorical columns, there were many candidate 'hues' to choose from for the multivariate boxplot analysis in the first notebook; this project narrowed them down to just one, which I judged to be the most important.
Additionally, the dependent class was binary, so it wasn't suitable for time series analysis. If the class had instead been continuous, like the other features, with a rating from 1 to 5, visualising its trend over time would have made more sense.

Future Work: This project limited its multivariate analysis to one variable; other variables could be included in the future.
Furthermore, more features, such as branch accessibility and response time, could be added to the data along with their ratings to make model training more robust.
More models could be used for a more extensive analysis.
Lastly, the satisfaction column should be turned into a continuous class.