# Individual Assignment #4

### Import Required Libraries

In [1]:
# Read and Write Files
import pandas as pd

# Ensembles and Classifiers
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

# Feature Selection / Extraction
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import chi2, SelectKBest, SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Natural Language Toolkit
import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Accuracy Metrics
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Miscellaneous
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Load in Datasets, Construct Training Set

In [2]:
customer_information_df = pd.read_csv("data/assignment_four/Customers.csv").set_index('ID')
comment_df = pd.read_csv("data/assignment_four/Comments.csv").set_index('ID')

In [3]:
customer_information_df.head()

Unnamed: 0_level_0,Sex,Status,Children,Est_Income,Car_Owner,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,Paymethod,LocalBilltype,LongDistanceBilltype,TARGET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,F,S,1,38000.0,N,229.64,24.393333,3,23.56,0.0,206.08,0,CC,Budget,Intnl_discount,Cancelled
6,M,M,2,29616.0,N,75.29,49.426667,2,29.78,0.0,45.5,0,CH,FreeLocal,Standard,Current
8,M,M,0,19732.8,N,47.25,50.673333,3,24.81,0.0,22.44,0,CC,FreeLocal,Standard,Current
11,M,S,2,96.33,N,59.01,56.473333,1,26.13,0.0,32.88,1,CC,Budget,Standard,Current
14,F,M,2,52004.8,N,28.14,25.14,1,5.03,0.0,23.11,0,CH,Budget,Intnl_discount,Cancelled


In [4]:
X_train = customer_information_df.drop('TARGET', axis=1)
y_train = customer_information_df['TARGET']

### Tokenize Data

In [5]:
comment_df['tokenized_comments'] = comment_df['Comments'].apply(word_tokenize)
comment_df.head()

Unnamed: 0_level_0,Comments,tokenized_comments
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1309,Does not like the way the phone works. It is t...,"[Does, not, like, the, way, the, phone, works,..."
3556,Wanted to know the nearest store location. Wan...,"[Wanted, to, know, the, nearest, store, locati..."
2230,Wants to know how to do text messaging. Referr...,"[Wants, to, know, how, to, do, text, messaging..."
2312,Asked how to disable call waiting. referred hi...,"[Asked, how, to, disable, call, waiting, ., re..."
3327,Needs help learning how to use the phone. I su...,"[Needs, help, learning, how, to, use, the, pho..."


### Apply English Snowball Stemmer

In [6]:
stemmer = SnowballStemmer("english")

In [7]:
snowball_df = pd.DataFrame()
snowball_df['stem_token_comments'] = comment_df['tokenized_comments'].apply(lambda x: [stemmer.stem(y) for y in x]).apply(lambda x: " ".join(x))
snowball_df.head()

Unnamed: 0_level_0,stem_token_comments
ID,Unnamed: 1_level_1
1309,doe not like the way the phone work . it is to...
3556,want to know the nearest store locat . want to...
2230,want to know how to do text messag . refer him...
2312,ask how to disabl call wait . refer him to web...
3327,need help learn how to use the phone . i sugge...


### Apply Count Vectorization

In [8]:
count_vectorizer = CountVectorizer(stop_words='english',lowercase=False)
term_document_counts = count_vectorizer.fit_transform(snowball_df['stem_token_comments'])
term_document_matrix = pd.DataFrame(term_document_counts.toarray(), columns=count_vectorizer.get_feature_names()).set_index(snowball_df.index)

### Compute Term Frequency–Inverse Document Frequency (TF-IDF) Matrix

In [9]:
tf_idf_transformer = TfidfTransformer()
tf_idf_X_train = tf_idf_transformer.fit_transform(term_document_counts)
tf_idf_df = pd.DataFrame(tf_idf_X_train.toarray(), columns=count_vectorizer.get_feature_names()).set_index(snowball_df.index)
tf_idf_df.head()

Unnamed: 0_level_0,3399,3g,abysm,access,accessori,adapt,add,addit,additon,address,...,wish,wll,wold,work,wors,worst,wrong,xvyx,year,york
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.209678,0.0,0.0,0.0,0.0,0.0,0.0
3556,0.0,0.0,0.0,0.0,0.27568,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Perform Feature Selection

In [10]:
selector = SelectKBest(score_func=chi2, k=50)
selector.fit_transform(tf_idf_df, y_train)
cols = selector.get_support(indices=True)

tf_idf_df_selected_features_df = tf_idf_df.iloc[:,cols]
tf_idf_df_selected_features_df.tail()

Unnamed: 0_level_0,address,adress,alway,bateri,chang,charg,charger,compar,complain,continu,...,sold,teach,tire,transeff,transfer,trust,turn,unlimit,weak,whi
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3034,0.0,0.0,0.0,0.0,0.0,0.446161,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1295,0.772949,0.0,0.0,0.0,0.545354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def accuracy_score(classifier, comparison):
    print("===ACCURACY SCORE===")
    print(round(classifier.score(comparison, y_train),6))
    print("")

In [12]:
def accuracy_report(predictions):
    print("===CONFUSION MATRIX===")
    print(confusion_matrix(y_train, predictions))
    print("")
    print("===CLASSIFICATION REPORT===")
    print(classification_report(y_train, predictions))

In [13]:
def cross_val_accuracy(cross_val_score):
    print("===ALL ACCURACY SCORES===")
    print(cross_val_score)
    print("")
    print("===MEAN ACCURACY SCORE===")
    print(round(cross_val_score.mean(),6))

# Cast Random Forest Classifier on Text Data

In [14]:
random_forest = RandomForestClassifier()
random_forest_text = random_forest.fit(tf_idf_df_selected_features_df, y_train)
random_forst_predictions = random_forest.predict(tf_idf_df_selected_features_df)
accuracy_score(random_forest, tf_idf_df_selected_features_df)
accuracy_report(random_forst_predictions)

===ACCURACY SCORE===
0.633816

===CONFUSION MATRIX===
[[  73  731]
 [  27 1239]]

===CLASSIFICATION REPORT===
              precision    recall  f1-score   support

   Cancelled       0.73      0.09      0.16       804
     Current       0.63      0.98      0.77      1266

    accuracy                           0.63      2070
   macro avg       0.68      0.53      0.46      2070
weighted avg       0.67      0.63      0.53      2070



### Cross-Validation on Random Forest Classifier for Text Data

In [15]:
random_forest_cross_val_score = cross_val_score(
    random_forest, 
    tf_idf_df_selected_features_df, 
    y_train, 
    cv=20, 
    scoring="balanced_accuracy"
)

cross_val_accuracy(random_forest_cross_val_score)

===ALL ACCURACY SCORES===
[0.53290747 0.49690283 0.52071235 0.52497096 0.509375   0.559375
 0.54375    0.53125    0.509375   0.5546875  0.54206349 0.52619048
 0.48869048 0.525      0.50119048 0.49662698 0.525      0.53869048
 0.48988095 0.50119048]

===MEAN ACCURACY SCORE===
0.520891


### Merge Original Customer Dataset and TF-IDF Selected Features

In [16]:
df = X_train.merge(tf_idf_df_selected_features_df, how ='inner', on='ID')

### Apply Encoding to Categorical Features

In [17]:
columns = df.keys()
encode_columns = []

In [18]:
for i in range(len(columns)):
    if type(df[columns[i]].iloc[0]) == str:
        encode_columns.append(columns[i])

In [19]:
encoded_df = pd.get_dummies(df, columns=encode_columns)
encoded_df.head()

Unnamed: 0_level_0,Children,Est_Income,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,address,...,Status_S,Car_Owner_N,Car_Owner_Y,Paymethod_Auto,Paymethod_CC,Paymethod_CH,LocalBilltype_Budget,LocalBilltype_FreeLocal,LongDistanceBilltype_Intnl_discount,LongDistanceBilltype_Standard
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,38000.0,229.64,24.393333,3,23.56,0.0,206.08,0,0.0,...,1,1,0,0,1,0,1,0,1,0
6,2,29616.0,75.29,49.426667,2,29.78,0.0,45.5,0,0.0,...,0,1,0,0,0,1,0,1,0,1
8,0,19732.8,47.25,50.673333,3,24.81,0.0,22.44,0,0.0,...,0,1,0,0,1,0,0,1,0,1
11,2,96.33,59.01,56.473333,1,26.13,0.0,32.88,1,0.0,...,1,1,0,0,1,0,1,0,0,1
14,2,52004.8,28.14,25.14,1,5.03,0.0,23.11,0,0.0,...,0,1,0,0,0,1,1,0,1,0


# Cast Random Forest Classifier on Full Dataset

In [20]:
total_random_forest = random_forest.fit(encoded_df, y_train)
total_random_forest_predictions = random_forest.predict(encoded_df)
accuracy_score(total_random_forest, encoded_df)
accuracy_report(total_random_forest_predictions)

===ACCURACY SCORE===
0.957971

===CONFUSION MATRIX===
[[ 760   44]
 [  43 1223]]

===CLASSIFICATION REPORT===
              precision    recall  f1-score   support

   Cancelled       0.95      0.95      0.95       804
     Current       0.97      0.97      0.97      1266

    accuracy                           0.96      2070
   macro avg       0.96      0.96      0.96      2070
weighted avg       0.96      0.96      0.96      2070



### Cross-Validation on Random Forest Classifier for Full Dataset

In [21]:
total_random_forest_cross_val_score = cross_val_score(
    total_random_forest, 
    encoded_df,
    y_train, 
    cv=20, 
    scoring="balanced_accuracy"
)

cross_val_accuracy(total_random_forest_cross_val_score)

===ALL ACCURACY SCORES===
[0.84436702 0.89198606 0.88288811 0.91153697 0.903125   0.921875
 0.8640625  0.821875   0.8796875  0.90625    0.93075397 0.90912698
 0.93075397 0.7968254  0.93075397 0.81031746 0.90238095 0.93075397
 0.90238095 0.91825397]

===MEAN ACCURACY SCORE===
0.889498


# Cast Random Forest Classifier on Dataset Without Text

In [22]:
no_text_df = encoded_df.drop(columns=tf_idf_df_selected_features_df.keys(), axis=1)
no_text_df.head()

Unnamed: 0_level_0,Children,Est_Income,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,Sex_F,...,Status_S,Car_Owner_N,Car_Owner_Y,Paymethod_Auto,Paymethod_CC,Paymethod_CH,LocalBilltype_Budget,LocalBilltype_FreeLocal,LongDistanceBilltype_Intnl_discount,LongDistanceBilltype_Standard
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,38000.0,229.64,24.393333,3,23.56,0.0,206.08,0,1,...,1,1,0,0,1,0,1,0,1,0
6,2,29616.0,75.29,49.426667,2,29.78,0.0,45.5,0,0,...,0,1,0,0,0,1,0,1,0,1
8,0,19732.8,47.25,50.673333,3,24.81,0.0,22.44,0,0,...,0,1,0,0,1,0,0,1,0,1
11,2,96.33,59.01,56.473333,1,26.13,0.0,32.88,1,0,...,1,1,0,0,1,0,1,0,0,1
14,2,52004.8,28.14,25.14,1,5.03,0.0,23.11,0,1,...,0,1,0,0,0,1,1,0,1,0


In [23]:
no_text_random_forest = random_forest.fit(no_text_df, y_train)
no_text_random_forest_predictions = random_forest.predict(no_text_df)
accuracy_score(no_text_random_forest, no_text_df)
accuracy_report(no_text_random_forest_predictions)

===ACCURACY SCORE===
0.957488

===CONFUSION MATRIX===
[[ 762   42]
 [  46 1220]]

===CLASSIFICATION REPORT===
              precision    recall  f1-score   support

   Cancelled       0.94      0.95      0.95       804
     Current       0.97      0.96      0.97      1266

    accuracy                           0.96      2070
   macro avg       0.95      0.96      0.96      2070
weighted avg       0.96      0.96      0.96      2070



### Cross-Validation on Random Forest Classifier for Dataset Without Text

In [24]:
no_text_random_forest_cross_val_score = cross_val_score(
    no_text_random_forest, 
    no_text_df,
    y_train, 
    cv=20, 
    scoring="balanced_accuracy"
)

cross_val_accuracy(no_text_random_forest_cross_val_score)

===ALL ACCURACY SCORES===
[0.84436702 0.87979094 0.89508324 0.91153697 0.903125   0.921875
 0.8515625  0.8109375  0.875      0.89375    0.91825397 0.90119048
 0.92281746 0.81269841 0.93075397 0.80238095 0.88988095 0.91825397
 0.88988095 0.91825397]

===MEAN ACCURACY SCORE===
0.88457


# Perform Feature Selection Using a Decision Tree Classifier

In [25]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(no_text_df, y_train)

model = SelectFromModel(decision_tree, prefit=True, max_features=7, threshold=-np.inf)
feature_idx = model.get_support()
feature_names = no_text_df.columns[feature_idx]

x = model.transform(no_text_df)
x_selected_features_df = pd.DataFrame(x, columns=feature_names).set_index(X_train.index)
x_selected_features_df

Unnamed: 0_level_0,Children,Est_Income,Age,RatePlan,LongDistance,Local,Status_S
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1.0,38000.00,24.393333,3.0,23.56,206.08,1.0
6,2.0,29616.00,49.426667,2.0,29.78,45.50,0.0
8,0.0,19732.80,50.673333,3.0,24.81,22.44,0.0
11,2.0,96.33,56.473333,1.0,26.13,32.88,1.0
14,2.0,52004.80,25.140000,1.0,5.03,23.11,0.0
...,...,...,...,...,...,...,...
3821,0.0,78851.30,48.373333,4.0,0.37,28.66,1.0
3822,1.0,17540.70,62.786667,1.0,22.17,13.45,1.0
3823,0.0,83891.90,61.020000,4.0,28.92,45.47,0.0
3824,2.0,28220.80,38.766667,4.0,26.49,12.46,0.0


# Sequential Forward Search

In [26]:
sequential_forward_search = SFS(
    decision_tree, 
    k_features=7, 
    forward=True, 
    floating=False, 
    verbose=False,
    scoring='accuracy',
    cv=0)

sequential_forward_search.fit(no_text_df, y_train)

SequentialFeatureSelector(cv=0, estimator=DecisionTreeClassifier(),
                          k_features=7, scoring='accuracy', verbose=False)

### Sequential Forward Search Results

In [27]:
print("===SEQUENTIAL FORWARD SEARCH PARAMETERS===")
print(sequential_forward_search.k_feature_names_)
print("")
print("===SEQUENTIAL FORWARD SEARCH SCORE===")
print(sequential_forward_search.k_score_)

===SEQUENTIAL FORWARD SEARCH PARAMETERS===
('Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'Paymethod_CC')

===SEQUENTIAL FORWARD SEARCH SCORE===
0.957487922705314
