# Individual Assignment #4

### Import Required Libraries

In [1]:
# Read and Write Files
import pandas as pd

# Ensembles and Classifiers
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

# Feature Selection / Extraction
from sklearn.feature_selection import chi2, SelectKBest, SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Natural Language Toolkit
import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Accuracy Metrics
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Miscellaneous
import warnings
warnings.filterwarnings("ignore")

### Load in Datasets, Construct Training Set

In [2]:
customer_information_df = pd.read_csv("data/assignment_four/Customers.csv").set_index('ID')
comment_df = pd.read_csv("data/assignment_four/Comments.csv").set_index('ID')

In [3]:
customer_information_df.head()

Unnamed: 0_level_0,Sex,Status,Children,Est_Income,Car_Owner,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,Paymethod,LocalBilltype,LongDistanceBilltype,TARGET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,F,S,1,38000.0,N,229.64,24.393333,3,23.56,0.0,206.08,0,CC,Budget,Intnl_discount,Cancelled
6,M,M,2,29616.0,N,75.29,49.426667,2,29.78,0.0,45.5,0,CH,FreeLocal,Standard,Current
8,M,M,0,19732.8,N,47.25,50.673333,3,24.81,0.0,22.44,0,CC,FreeLocal,Standard,Current
11,M,S,2,96.33,N,59.01,56.473333,1,26.13,0.0,32.88,1,CC,Budget,Standard,Current
14,F,M,2,52004.8,N,28.14,25.14,1,5.03,0.0,23.11,0,CH,Budget,Intnl_discount,Cancelled


In [4]:
X_train = customer_information_df.drop('TARGET', axis=1)
y_train = customer_information_df['TARGET']

### Tokenize Data

In [5]:
comment_df['tokenized_comments'] = comment_df['Comments'].apply(word_tokenize)
comment_df.head()

Unnamed: 0_level_0,Comments,tokenized_comments
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1309,Does not like the way the phone works. It is t...,"[Does, not, like, the, way, the, phone, works,..."
3556,Wanted to know the nearest store location. Wan...,"[Wanted, to, know, the, nearest, store, locati..."
2230,Wants to know how to do text messaging. Referr...,"[Wants, to, know, how, to, do, text, messaging..."
2312,Asked how to disable call waiting. referred hi...,"[Asked, how, to, disable, call, waiting, ., re..."
3327,Needs help learning how to use the phone. I su...,"[Needs, help, learning, how, to, use, the, pho..."


### Apply English Snowball Stemmer

In [6]:
stemmer = SnowballStemmer("english")

In [7]:
snowball_df = pd.DataFrame()
snowball_df['stem_token_comments'] = comment_df['tokenized_comments'].apply(lambda x: [stemmer.stem(y) for y in x]).apply(lambda x: " ".join(x))
snowball_df.head()

Unnamed: 0_level_0,stem_token_comments
ID,Unnamed: 1_level_1
1309,doe not like the way the phone work . it is to...
3556,want to know the nearest store locat . want to...
2230,want to know how to do text messag . refer him...
2312,ask how to disabl call wait . refer him to web...
3327,need help learn how to use the phone . i sugge...


### Apply Count Vectorization

In [8]:
count_vectorizer = CountVectorizer(stop_words='english',lowercase=False)
term_document_counts = count_vectorizer.fit_transform(snowball_df['stem_token_comments'])
term_document_matrix = pd.DataFrame(term_document_counts.toarray(), columns=count_vectorizer.get_feature_names()).set_index(snowball_df.index)

### Compute Term Frequency–Inverse Document Frequency (TF-IDF) Matrix

In [9]:
tf_idf_transformer = TfidfTransformer()
tf_idf_X_train = tf_idf_transformer.fit_transform(term_document_counts)
tf_idf_df = pd.DataFrame(tf_idf_X_train.toarray(), columns=count_vectorizer.get_feature_names()).set_index(snowball_df.index)
tf_idf_df.head()

Unnamed: 0_level_0,3399,3g,abysm,access,accessori,adapt,add,addit,additon,address,...,wish,wll,wold,work,wors,worst,wrong,xvyx,year,york
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.209678,0.0,0.0,0.0,0.0,0.0,0.0
3556,0.0,0.0,0.0,0.0,0.27568,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Perform Feature Selection

In [30]:
selector = SelectKBest(score_func=chi2, k=50)
selector.fit_transform(tf_idf_df, y_train)
cols = selector.get_support(indices=True)

tf_idf_df_selected_features_df = tf_idf_df.iloc[:,cols]
tf_idf_df_selected_features_df.tail()

Unnamed: 0_level_0,address,adress,alway,bateri,chang,charg,charger,compar,complain,continu,...,sold,teach,tire,transeff,transfer,trust,turn,unlimit,weak,whi
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3034,0.0,0.0,0.0,0.0,0.0,0.446161,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1295,0.772949,0.0,0.0,0.0,0.545354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
def accuracy_score(classifier):
    print(f"Accuracy Score: {round(classifier.score(tf_idf_df_selected_features_df, y_train),6)}.")

In [41]:
def accuracy_report(predictions):
    print("===CONFUSION MATRIX===")
    print(confusion_matrix(y_train, predictions))
    print("===CLASSIFICATION REPORT===")
    print(classification_report(y_train, predictions))

# Cast Random Forest Classifier on Text Data

In [None]:
random_forest = RandomForestClassifier()
random_forest_text = random_forest.fit(tf_idf_df_selected_features_df, y_train)
accuracy_score(random_forest)
rf_predictions = random_forest.predict(tf_idf_df_selected_features_df)
print("Confusion Matrix:")
print(confusion_matrix(y_train, rf_predictions))
print("Classification Report")
print(classification_report(y_train, rf_predictions))

<function __main__.accuracy_report()>