In [None]:
#Importing important libraries
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


#NLTK-------------------------------
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# Import libraries for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tntnt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the two input tables: the customer comments and the customer attributes.
textfile = r'C:\Users\tntnt\Downloads\Comments (1).csv'
CustInfofile = r'C:\Users\tntnt\Downloads\Customers (1).csv'

textData = pd.read_csv(textfile)          # comments as a DataFrame
CustInfoData = pd.read_csv(CustInfofile)  # customer attributes as a DataFrame

# Report (rows, columns) of each table, comments first.
for frame in (textData, CustInfoData):
    print(frame.shape)


(2070, 2)
(2070, 17)


In [None]:
# Separate the label column from the customer attributes.
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop("TARGET", axis=1)  # every column except the label

# Quick look at the feature frame and the comment frame, then the labels.
for frame in (X_train, textData):
    print(frame.shape)
    print(frame.head())
print(y_train)

(2070, 16)
   ID Sex Status  Children  Est_Income Car_Owner   Usage        Age  RatePlan  \
0   1   F      S         1    38000.00         N  229.64  24.393333         3   
1   6   M      M         2    29616.00         N   75.29  49.426667         2   
2   8   M      M         0    19732.80         N   47.25  50.673333         3   
3  11   M      S         2       96.33         N   59.01  56.473333         1   
4  14   F      M         2    52004.80         N   28.14  25.140000         1   

   LongDistance  International   Local  Dropped Paymethod LocalBilltype  \
0         23.56            0.0  206.08        0        CC        Budget   
1         29.78            0.0   45.50        0        CH     FreeLocal   
2         24.81            0.0   22.44        0        CC     FreeLocal   
3         26.13            0.0   32.88        1        CC        Budget   
4          5.03            0.0   23.11        0        CH        Budget   

  LongDistanceBilltype  
0       Intnl_discount  
1

In [None]:
# Tokenize: turn every comment string into a list of word tokens.
textData['CommentsTokenized'] = textData['Comments'].map(word_tokenize)

# Persist the tokenized frame for inspection.
export_csv = textData.to_csv(r'C:\Users\tntnt\Downloads\TextDataTokenized.csv')

In [None]:
# Stem the tokenized comments with the Snowball stemmer configured for English.
stemmer = SnowballStemmer("english")

# Build the stemmed frame directly from textData: keep every column except the
# raw and tokenized comments, then add one column holding the list of stemmed
# tokens for each comment. (The previous empty-DataFrame initialisation was
# overwritten on the next line and has been removed.)
newTextData = textData.drop(columns=["CommentsTokenized", "Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# to_csv returns None when given a path, so its result is not kept.
newTextData.to_csv(r'C:\Users\tntnt\Downloads\Stemmer.csv')


In [None]:
# Stem the tokenized comments with the Porter stemmer.
stemmer = PorterStemmer()

# Build the stemmed frame directly from textData: keep every column except the
# raw and tokenized comments, then add one column holding the list of stemmed
# tokens for each comment. (The previous empty-DataFrame initialisation was
# overwritten on the next line and has been removed.)
newTextData = textData.drop(columns=["CommentsTokenized", "Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# to_csv returns None when given a path, so its result is not kept.
newTextData.to_csv(r'C:\Users\tntnt\Downloads\Porter.csv')


In [None]:
# Stem the tokenized comments with the Lancaster stemmer.
# NOTE(review): `stemmer` and `newTextData` are rebound by every stemming cell,
# so whichever stemming cell ran last is the one later cells consume.
stemmer = LancasterStemmer()

# Build the stemmed frame directly from textData: keep every column except the
# raw and tokenized comments, then add one column holding the list of stemmed
# tokens for each comment. (The previous empty-DataFrame initialisation was
# overwritten on the next line and has been removed.)
newTextData = textData.drop(columns=["CommentsTokenized", "Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# to_csv returns None when given a path, so its result is not kept.
newTextData.to_csv(r'C:\Users\tntnt\Downloads\Lancaster.csv')


In [None]:
# Join each list of stemmed tokens back into one space-separated string.
# The column is overwritten in place, so the cell must be safe to re-run:
# values that are already strings are left alone (joining a string would
# otherwise split it into single characters separated by spaces).
newTextData['CommentsTokenizedStemmed'] = newTextData['CommentsTokenizedStemmed'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)

# to_csv returns None when given a path, so its result is not kept.
newTextData.to_csv(r'C:\Users\tntnt\Downloads\NewTextDataJoined.csv')

In [None]:
# Bag-of-words model: learn the vocabulary from the stemmed comments and build
# the term-document count matrix. English stop words are removed and
# lowercase=False leaves the case of the tokens untouched.
count_vect = CountVectorizer(lowercase=False, stop_words='english')
TD_counts = count_vect.fit_transform(newTextData['CommentsTokenizedStemmed'])
print(TD_counts.shape)

# Dense copy of the counts, exported for inspection.
DF_TD_Counts = pd.DataFrame(data=TD_counts.toarray())
export_csv = DF_TD_Counts.to_csv(r'C:\Users\tntnt\Downloads\TD-counts-TokenizedStemmed.csv')


(2070, 364)


In [None]:
# Re-weight the raw term counts with TF-IDF.
tfidf_transformer = TfidfTransformer().fit(TD_counts)
X_train_tfidf = tfidf_transformer.transform(TD_counts)
print(X_train_tfidf.shape)

# Dense DataFrame version of the TF-IDF matrix, exported for inspection.
DF_TF_IDF = pd.DataFrame(data=X_train_tfidf.toarray())
export_csv = DF_TF_IDF.to_csv(r'C:\Users\tntnt\Downloads\TFIDF-counts-TokenizedStemmed.csv')


(2070, 364)


In [None]:
# Merge the customer attributes with the TF-IDF text features on the customer ID.

# Attach the ID to a copy of the TF-IDF frame instead of mutating DF_TF_IDF,
# so DF_TF_IDF stays a pure text-feature matrix for the cells that use it later.
# Rows are matched by index, i.e. textData and DF_TF_IDF must be in the same order.
tfidf_with_id = DF_TF_IDF.assign(ID=textData['ID'])

# Rebuild the customer-attribute frame from CustInfoData rather than reading
# X_train: X_train is rebound to the training split further down the notebook,
# which would make this cell fail or misbehave when re-run.
customer_features = CustInfoData.drop(columns=["TARGET"])
combined = pd.merge(customer_features, tfidf_with_id, on='ID')

# The TF-IDF columns are labelled with integers while the customer columns are
# strings; normalise every label to str because scikit-learn only supports
# feature names when all column names are strings.
combined.columns = combined.columns.astype(str)

print(combined.shape)
# to_csv returns None when given a path, so its result is not kept.
combined.to_csv(r'C:\Users\tntnt\Downloads\combined.csv')

(2070, 380)


In [None]:
# One-hot encode the categorical customer attributes.
X_cat = [
    "Sex",
    "Status",
    "Car_Owner",
    "Paymethod",
    "LocalBilltype",
    "LongDistanceBilltype",
]
print(X_cat)

# Each listed column is replaced by one indicator column per category value.
combined_one_hot = pd.get_dummies(data=combined, columns=X_cat)
print(combined_one_hot.shape)
export_csv = combined_one_hot.to_csv(r'C:\Users\tntnt\Downloads\OneHotEncoding.csv')

['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 388)


In [None]:
# Feature selection (filter method) on the TF-IDF features.
# SelectKBest ranks the columns with its default score function (f_classif)
# and keeps the 25 highest-scoring ones.
selector = SelectKBest(k=25)

# DF_TF_IDF may carry an 'ID' column (added for the merge with the customer
# data); drop it if present so the customer identifier is never scored as a
# candidate text feature.
tfidf_features = DF_TF_IDF.drop(columns=['ID'], errors='ignore')
new_DF_TF_IDF = selector.fit_transform(tfidf_features, y_train)
print(new_DF_TF_IDF.shape)

# Positional indices of the selected columns (not their names).
feature_names_out = selector.get_support(indices=True)
print(feature_names_out)

DF_TF_IDF_SelectedFeatures = pd.DataFrame(new_DF_TF_IDF)

# to_csv returns None when given a path, so its result is not kept.
DF_TF_IDF_SelectedFeatures.to_csv(r'C:\Users\tntnt\Downloads\TFIDF-25.csv')


(2070, 25)
[ 15  49  61  69  79 101 115 118 121 175 187 191 206 214 216 221 226 248
 249 260 272 304 313 319 326]


In [None]:
# Feature selection (filter method) on the TF-IDF features.
# SelectKBest ranks the columns with its default score function (f_classif)
# and keeps the 50 highest-scoring ones.
selector = SelectKBest(k=50)

# DF_TF_IDF may carry an 'ID' column (added for the merge with the customer
# data); drop it if present so the customer identifier is never scored as a
# candidate text feature.
tfidf_features = DF_TF_IDF.drop(columns=['ID'], errors='ignore')
new_DF_TF_IDF = selector.fit_transform(tfidf_features, y_train)
print(new_DF_TF_IDF.shape)

# Positional indices of the selected columns (not their names).
feature_names_out = selector.get_support(indices=True)
print(feature_names_out)

DF_TF_IDF_SelectedFeatures = pd.DataFrame(new_DF_TF_IDF)

# to_csv returns None when given a path, so its result is not kept.
DF_TF_IDF_SelectedFeatures.to_csv(r'C:\Users\tntnt\Downloads\TFIDF-50.csv')


(2070, 50)
[  0  15  17  36  48  49  61  63  69  71  79  97 101 109 115 118 121 130
 141 148 160 161 175 187 191 196 206 214 216 221 225 226 237 244 248 249
 254 260 265 272 304 313 318 319 321 326 330 332 342 359]


In [None]:
# Feature selection (filter method) on the TF-IDF features.
# SelectKBest ranks the columns with its default score function (f_classif)
# and keeps the 75 highest-scoring ones.
selector = SelectKBest(k=75)

# DF_TF_IDF may carry an 'ID' column (added for the merge with the customer
# data); drop it if present so the customer identifier is never scored as a
# candidate text feature.
tfidf_features = DF_TF_IDF.drop(columns=['ID'], errors='ignore')
new_DF_TF_IDF = selector.fit_transform(tfidf_features, y_train)
print(new_DF_TF_IDF.shape)

# Positional indices of the selected columns (not their names).
feature_names_out = selector.get_support(indices=True)
print(feature_names_out)

DF_TF_IDF_SelectedFeatures = pd.DataFrame(new_DF_TF_IDF)

# to_csv returns None when given a path, so its result is not kept.
DF_TF_IDF_SelectedFeatures.to_csv(r'C:\Users\tntnt\Downloads\TFIDF-75.csv')

(2070, 75)
[  0  15  17  20  36  48  49  50  61  63  69  71  72  76  79  86  97 101
 108 109 115 118 120 121 126 130 141 148 159 160 161 171 175 187 191 195
 196 206 214 216 221 225 226 237 240 244 245 248 249 254 256 260 263 265
 271 272 273 277 278 287 298 304 313 318 319 321 324 326 330 332 337 342
 350 359 361]


In [None]:
# Feature selection (filter method) on the TF-IDF features.
# SelectKBest ranks the columns with its default score function (f_classif)
# and keeps the 10 highest-scoring ones.
# NOTE(review): every SelectKBest cell rebinds DF_TF_IDF_SelectedFeatures, so
# later cells see the result of whichever of these cells ran last.
selector = SelectKBest(k=10)

# DF_TF_IDF may carry an 'ID' column (added for the merge with the customer
# data); drop it if present so the customer identifier is never scored as a
# candidate text feature.
tfidf_features = DF_TF_IDF.drop(columns=['ID'], errors='ignore')
new_DF_TF_IDF = selector.fit_transform(tfidf_features, y_train)
print(new_DF_TF_IDF.shape)

# Positional indices of the selected columns (not their names).
feature_names_out = selector.get_support(indices=True)
print(feature_names_out)

DF_TF_IDF_SelectedFeatures = pd.DataFrame(new_DF_TF_IDF)

# to_csv returns None when given a path, so its result is not kept.
DF_TF_IDF_SelectedFeatures.to_csv(r'C:\Users\tntnt\Downloads\TFIDF-10.csv')

(2070, 10)
[ 15  69 121 187 214 221 249 272 319 326]


In [None]:
# Model-based feature selection with gradient boosting: fit a small GBM on the
# one-hot encoded data and let SelectFromModel keep the columns whose
# importance passes its default threshold.
# NOTE(review): combined_one_hot still contains the customer 'ID' column, so
# the identifier is offered to the model as a feature — confirm that is intended.
clf = GradientBoostingClassifier(n_estimators=10)
clf = clf.fit(combined_one_hot, y_train)

# A single selector is built. The former max_features=7 / threshold=-np.inf
# variant was overwritten on the very next line and never took effect, so it
# has been removed; the effective behaviour (default threshold) is unchanged.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(combined_one_hot)
X_new_SelectedFeatures = pd.DataFrame(X_new)

# Shape plus the first rows is enough to inspect the result; the full-frame
# print that preceded them only repeated the same information.
print(X_new_SelectedFeatures.shape)
print(X_new_SelectedFeatures.head())

# to_csv returns None when given a path, so its result is not kept.
X_new_SelectedFeatures.to_csv(r'C:\Users\tntnt\Downloads\GBM.csv')

       0         1          2      3     4       5         6    7         8   \
0     1.0  38000.00  24.393333  23.56  0.00  206.08  0.000000  0.0  0.000000   
1     2.0  29616.00  49.426667  29.78  0.00   45.50  0.000000  0.0  0.241722   
2     0.0  19732.80  50.673333  24.81  0.00   22.44  0.000000  0.0  0.241722   
3     2.0     96.33  56.473333  26.13  0.00   32.88  0.000000  0.0  0.241722   
4     2.0  52004.80  25.140000   5.03  0.00   23.11  0.000000  0.0  0.241722   
...   ...       ...        ...    ...   ...     ...       ...  ...       ...   
2065  0.0  78851.30  48.373333   0.37  0.00   28.66  0.387706  0.0  0.219662   
2066  1.0  17540.70  62.786667  22.17  0.57   13.45  0.387706  0.0  0.219662   
2067  0.0  83891.90  61.020000  28.92  0.00   45.47  0.387706  0.0  0.219662   
2068  2.0  28220.80  38.766667  26.49  0.00   12.46  0.387706  0.0  0.219662   
2069  0.0  28589.10  15.600000  13.19  0.00   87.09  0.387706  0.0  0.219662   

            9         10   11   12     

In [None]:
# Boolean mask over the selector's input columns: True where a column was kept.
selected_mask = model.get_support()
print(selected_mask)

# Integer positions of the kept columns (indices, not column names).
cols = np.flatnonzero(selected_mask)
print("\n cols = ", cols, "\n")

[False  True  True False  True False  True  True  True False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

In [None]:
#Using wrapper method for feature selection 
#Constructing Randon forest classifier
clf=RandomForestClassifier()
RF_text = clf.fit(DF_TF_IDF_SelectedFeatures,y_train)
model = SelectFromModel(clf, prefit=True, max_features=20, threshold=-np.inf)
#print("Accuracy score (training): {0:.6f}".format(clf.score(DF_TF_IDF_SelectedFeatures, y_train)))
rf_predictions = clf.predict(DF_TF_IDF_SelectedFeatures)
print("Confusion Matrix:")
print(confusion_matrix(y_train, rf_predictions))
print("Classification Report")
print(classification_report(y_train, rf_predictions))

Confusion Matrix:
[[  35  769]
 [  10 1256]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.78      0.04      0.08       804
     Current       0.62      0.99      0.76      1266

    accuracy                           0.62      2070
   macro avg       0.70      0.52      0.42      2070
weighted avg       0.68      0.62      0.50      2070



In [None]:
# Boolean mask over the selector's input columns: True where a column was kept.
selected_mask = model.get_support()
print(selected_mask)

# Integer positions of the kept columns (indices, not column names).
cols = np.flatnonzero(selected_mask)
print("\n cols = ", cols, "\n")

[ True  True  True  True  True  True  True  True  True  True]

 cols =  [0 1 2 3 4 5 6 7 8 9] 



In [None]:
# Feature matrix for modelling: the one-hot encoded, merged customer + text data.
dataframe = pd.DataFrame(data=combined_one_hot)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Labels as a one-column DataFrame (the column keeps the Series' name).
Y_Train = y_train.to_frame()

In [None]:
# Put the features and the target side by side in one table.
# axis=1 aligns on the row index and appends the target as an extra column;
# the default (axis=0) would stack the target rows underneath the feature rows,
# doubling the row count and filling the frame with NaN.
new_data = pd.concat([dataframe, Y_Train], axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition reproducible.
# NOTE(review): this rebinds X_train, which earlier cells used for the
# customer-attribute frame — those cells cannot be re-run after this one.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataframe,
    Y_Train,
    test_size=.20,
    random_state=1,
)

# Shapes in the order: train features, train labels, test features, test labels.
for part in (X_train, Y_train, X_test, Y_test):
    print(part.shape)

(1656, 388)
(1656, 1)
(414, 388)
(414, 1)


In [None]:
# Train a random forest on the training split and evaluate it on the hold-out split.
rf = RandomForestClassifier()
# Y_train is a one-column DataFrame; flatten it to the 1-D label array that
# scikit-learn classifiers expect.
rf.fit(X_train, Y_train.values.ravel())
features_df = X_test.iloc[:]
print(features_df.head())

rf_predict = rf.predict(features_df)
# The format string needs a placeholder, otherwise only the word "accuracy"
# is printed and the score is silently dropped.
print("accuracy: {:.4f}".format(rf.score(features_df, Y_test)))
print(confusion_matrix(Y_test, rf_predict))
print(classification_report(Y_test, rf_predict))

# 5-fold cross-validated accuracy on the training split.
rf_cv_score = cross_val_score(rf, X_train, Y_train.values.ravel(), cv=5, scoring="accuracy")
print(rf_cv_score)
print(rf_cv_score.mean())

        ID  Children  Est_Income   Usage        Age  RatePlan  LongDistance  \
724   1318         1    33084.30  125.05  50.313333         1         19.77   
348    670         2     1406.05   26.72  19.306667         3         11.55   
102    224         2    69343.30   21.91  42.166667         2          6.36   
1080  1967         0    30000.00  265.78  23.000000         1         45.00   
1758  3228         0    15924.20    1.46  14.460000         2          0.00   
...    ...       ...         ...     ...        ...       ...           ...   
264    492         1     8073.11   89.05  43.000000         2         28.70   
1071  1955         1     3960.50   96.13  14.693333         2         29.78   
1273  2308         1     7545.96  200.75  16.753333         1         22.39   
1438  2620         0    78851.30   29.04  48.373333         1          0.37   
1363  2499         2    76289.20  104.63  48.206667         3         22.20   

      International   Local  Dropped  ...  Status_S

In [None]:
# 80/20 train/test split with a fixed seed, so the partition is reproducible.
# The split statement used to sit on the same line as the comment, which
# commented it out; it now runs on its own line.
X_train, X_test, Y_train, Y_test = train_test_split(dataframe, Y_Train, test_size = .20, random_state = 1)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(1656, 388)
(1656, 1)
(414, 388)
(414, 1)


In [None]:
# Train a gradient-boosting classifier on the training split and evaluate it
# on the hold-out split.
gbc = GradientBoostingClassifier()
# Y_train is a one-column DataFrame; flatten it to the 1-D label array that
# scikit-learn classifiers expect.
gbc.fit(X_train, Y_train.values.ravel())
features_df = X_test.iloc[:]
print(features_df.head())

gbc_predict = gbc.predict(features_df)
# The format string needs a placeholder, otherwise only the word "accuracy"
# is printed and the score is silently dropped.
print("accuracy: {:.4f}".format(gbc.score(features_df, Y_test)))
print(confusion_matrix(Y_test, gbc_predict))
print(classification_report(Y_test, gbc_predict))

# 5-fold cross-validated accuracy of the gradient-boosting model itself
# (previously the random forest `rf` was cross-validated here by mistake).
gbc_cv_score = cross_val_score(gbc, X_train, Y_train.values.ravel(), cv=5, scoring="accuracy")
print(gbc_cv_score)
print(gbc_cv_score.mean())

        ID  Children  Est_Income   Usage        Age  RatePlan  LongDistance  \
724   1318         1    33084.30  125.05  50.313333         1         19.77   
348    670         2     1406.05   26.72  19.306667         3         11.55   
102    224         2    69343.30   21.91  42.166667         2          6.36   
1080  1967         0    30000.00  265.78  23.000000         1         45.00   
1758  3228         0    15924.20    1.46  14.460000         2          0.00   
...    ...       ...         ...     ...        ...       ...           ...   
264    492         1     8073.11   89.05  43.000000         2         28.70   
1071  1955         1     3960.50   96.13  14.693333         2         29.78   
1273  2308         1     7545.96  200.75  16.753333         1         22.39   
1438  2620         0    78851.30   29.04  48.373333         1          0.37   
1363  2499         2    76289.20  104.63  48.206667         3         22.20   

      International   Local  Dropped  ...  Status_S