# Loading Data

In [73]:
#Connect to google drive data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [74]:
import pandas as pd
import numpy as np
import nltk
import re

In [75]:
df = pd.read_csv('/content/drive/MyDrive/Hackathon M2/Our Data/Customer Reviews - SKYTRAX.csv')

In [76]:
df.head(5)

Unnamed: 0,score,title,user_country,user_name,review_date,is_verified,review_body,airline
0,7.0,"""my trip was pretty good.""",United States,N Palustre,9th February 2022,0,My flight from Dulles to Paris was great. My f...,Air France
1,1.0,"""my baggage didn't make it""",Lebanon,Nour El Khoury,27th January 2022,1,"I went for 2 months to Nice, when I arrived i...",Air France
2,3.0,Air France customer review,United States,Daniel Rabourn,26th January 2022,1,"Yesterday Jan 25, we flew Colombo, Sri Lanka t...",Air France
3,8.0,"""great alternative to fly across the Atlantic""",United Kingdom,D Garlin,19th January 2022,1,First medium-haul experience with Air France....,Air France
4,8.0,"""A pleasurable experience!""",United States,S Gillemo,4th January 2022,1,BOD-CDG-ATL: Was having some major issue with...,Air France


In [77]:
# check the distribution
df['score'].value_counts()

1.0     2172
10.0    1390
9.0     1032
8.0      809
2.0      710
3.0      529
7.0      526
5.0      447
4.0      389
6.0      358
Name: score, dtype: int64

# Pre-Processing

## Full Reviews

In [78]:
# add title to review_body
df['full_review'] = df['title'] + df['review_body']

In [79]:
# new df with only relevant inputs for TF IDF + Classifier (might experiment with other variables later)
df_slim = df[['score', 'full_review']]

In [80]:
df_slim.head(5)

Unnamed: 0,score,full_review
0,7.0,"""my trip was pretty good.""My flight from Dulle..."
1,1.0,"""my baggage didn't make it"" I went for 2 month..."
2,3.0,"Air France customer reviewYesterday Jan 25, we..."
3,8.0,"""great alternative to fly across the Atlantic""..."
4,8.0,"""A pleasurable experience!"" BOD-CDG-ATL: Was h..."


## Labeling by score

In [81]:
# even reviews with score of 5 seem bad
df_slim[df_slim['score'] == 5]

Unnamed: 0,score,full_review
14,5.0,"""All in all, a decent experience"" It was very ..."
58,5.0,"""had the mask only on the chin"" Today as I boa..."
87,5.0,"""My disappointment is with the food service"" F..."
127,5.0,"""Overall impression: sloppy product"" Flight fr..."
160,5.0,"""most uncomfortable flight"" Mauritius to Paris..."
...,...,...
8296,5.0,"""will not be flying with Ryanair again"" Liverp..."
8298,5.0,"""not be using Ryanair transfer services"" Venic..."
8312,5.0,"""avoid this airline from now on"" Rome to Budap..."
8382,5.0,"""use again purely for price"" Flew from Stanste..."


In [82]:
# reviews with 6 are pretty generic, from seven is when they start being fairly positive
df_slim[df_slim['score'] == 6]

Unnamed: 0,score,full_review
16,6.0,"""Check-in was a mess"" Check-in was a mess. The..."
92,6.0,"""checked size and weight of cabin bags"" Paris ..."
124,6.0,"""My issue is with the baggage policy"" First, t..."
133,6.0,"""Very poor business class"" Bought a seat in bu..."
176,6.0,"""cabin looked very tired and in need of a refr..."
...,...,...
8324,6.0,"""should improve some parts"" Wroclaw to Warsaw...."
8348,6.0,"""I would fly with Ryanair again"" Rome to Stans..."
8369,6.0,"""attitude of that wonderful lady"" Stansted to ..."
8388,6.0,"""you cannot have high expectations"" Ryanair pr..."


In [83]:
# create binary label of reviews from scores giving on review
'''
If score is 7-10 is a good rating
6 is neutral
1-5 is bad

If score is 7-10 then 1 (labeled good)
if score is 6 these rows will be ignored
if 1-5 then 0 (labeled bad)
'''

# filter our reviews with 6
df_slimmer = df_slim[df_slim['score'] != 6]

# label reviews
labels = []
for row in df_slimmer['score']:
    if row <= 5:    labels.append(0) # bad reviews
    else:   labels.append(1) # good reviews

df_slimmer['label'] = labels


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Lemmatization

In [None]:
# lower case
df_slimmer['lower_case'] = df_slimmer['full_review'].apply(lambda x: ' '.join(x.lower() for x in str(x).split()))
# remove possible HTML tags and any URLS
from bs4 import BeautifulSoup
df_slimmer['pre_process']=df_slimmer['lower_case'].apply(lambda x: BeautifulSoup(x).get_text())
import re
df_slimmer['pre_process']=df_slimmer['pre_process'].apply(lambda x: re.sub(r"http\S+", '', x))
# contractions
def contractions(s):
 s = re.sub(r"won’t", "will not",s)
 s = re.sub(r"-", " to ",s) # for things like CDG-ATL
 s = re.sub(r"would’t", "would not",s)
 s = re.sub(r"could’t", "could not",s)
 s = re.sub(r"cannot", "can not",s)
 s = re.sub(r"\’d", " would",s)
 s = re.sub(r"can\’t", "can not",s)
 s = re.sub(r"n\’t", " not", s)
 s= re.sub(r"\’re", " are", s)
 s = re.sub(r"\’s", " is", s)
 s = re.sub(r"\’ll", " will", s)
 s = re.sub(r"\’t", " not", s)
 s = re.sub(r"\’ve", " have", s)
 s = re.sub(r"\’m", " am", s)
 return s

# replace contractions
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x:contractions(x))
# remove non-alphabet characters
nltk.download('punkt')
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: " ".join([re.sub("[^A-Za-z]+","", x) for x in nltk.word_tokenize(x)]))
df_slimmer = df_slimmer[['score', 'label', 'full_review', 'pre_process']]
# remove extra spaces
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: re.sub(' +', " ", x))
# remove stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words("english")
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))
# lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
# pass over ready df
df_pp = df_slimmer[['score', 'label', 'pre_process']]

# TD IDF + Classifiers Feature Extraction

## TD IDF Vector/Matrix

In [88]:
# renaming columns to not get it twisted 
df_pp = df_pp.rename(columns={'pre_process': 'lemmatized'})

In [194]:
# TD IDF using scikitlearn
from sklearn.model_selection import train_test_split

# split data
X_train,X_test,Y_train, Y_test = train_test_split(df_pp['lemmatized'], df_pp['label'], test_size=0.25, random_state=30)
print("Train: ",X_train.shape,Y_train.shape, "Test: ",(X_test.shape,Y_test.shape))

Train:  (6048,) (6048,) Test:  ((2017,), (2017,))


In [195]:
# TD IDF vectorizer
print("TFIDF Vectorizer……")

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)
print(" done!")

TFIDF Vectorizer……
 done!


## SVM

In [165]:
# SVM
from sklearn.svm import LinearSVC

svm = LinearSVC(random_state=0)

In [166]:
# fit training data into the model
svm.fit(tf_x_train,Y_train)

LinearSVC(random_state=0)

In [168]:
# predict test data
y_test_pred = svm.predict(tf_x_test)


In [169]:
# analyze results
from sklearn.metrics import classification_report

report = classification_report(Y_test, y_test_pred, output_dict=True)

In [170]:
report

{'0': {'f1-score': 0.9298162976919454,
  'precision': 0.9293785310734464,
  'recall': 0.9302544769085768,
  'support': 1061},
 '1': {'f1-score': 0.9220303506017792,
  'precision': 0.9225130890052357,
  'recall': 0.9215481171548117,
  'support': 956},
 'accuracy': 0.9261279127416956,
 'macro avg': {'f1-score': 0.9259233241468623,
  'precision': 0.925945810039341,
  'recall': 0.9259012970316942,
  'support': 2017},
 'weighted avg': {'f1-score': 0.9261259826606122,
  'precision': 0.9261245089528666,
  'recall': 0.9261279127416956,
  'support': 2017}}

## Logistic Regression

In [101]:
# logistic regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=1000,solver="saga")

In [102]:
# fit model to trainning data
clf.fit(tf_x_train,Y_train)

LogisticRegression(max_iter=1000, solver='saga')

In [103]:
# predicting the test data
y_test_pred = clf.predict(tf_x_test)

In [104]:
# analyze results
from sklearn.metrics import classification_report
report = classification_report(Y_test, y_test_pred, output_dict=True)

In [105]:
report

{'0': {'f1-score': 0.9247412982126059,
  'precision': 0.9230046948356807,
  'recall': 0.9264844486333648,
  'support': 1061},
 '1': {'f1-score': 0.9161425576519916,
  'precision': 0.9180672268907563,
  'recall': 0.9142259414225942,
  'support': 956},
 'accuracy': 0.9206742687159147,
 'macro avg': {'f1-score': 0.9204419279322987,
  'precision': 0.9205359608632184,
  'recall': 0.9203551950279795,
  'support': 2017},
 'weighted avg': {'f1-score': 0.9206657424486261,
  'precision': 0.9206644770095291,
  'recall': 0.9206742687159147,
  'support': 2017}}

## Random Forests

In [116]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=20, n_estimators=1000, random_state=0)

In [117]:
# fit model to trainning data
clf.fit(tf_x_train,Y_train)

RandomForestClassifier(max_depth=20, n_estimators=1000, random_state=0)

In [118]:
# predicting the test data
y_test_pred = clf.predict(tf_x_test)

In [119]:
# analyze results
from sklearn.metrics import classification_report
report = classification_report(Y_test, y_test_pred, output_dict=True)

In [120]:
report

{'0': {'f1-score': 0.886255924170616,
  'precision': 0.8913250714966635,
  'recall': 0.88124410933082,
  'support': 1061},
 '1': {'f1-score': 0.8752598752598754,
  'precision': 0.8698347107438017,
  'recall': 0.8807531380753139,
  'support': 956},
 'accuracy': 0.881011403073872,
 'macro avg': {'f1-score': 0.8807578997152457,
  'precision': 0.8805798911202326,
  'recall': 0.8809986237030669,
  'support': 2017},
 'weighted avg': {'f1-score': 0.8810441131846626,
  'precision': 0.881139258467543,
  'recall': 0.881011403073872,
  'support': 2017}}

### RF + GridSearch + CV

In [137]:
# initialize gridsearch model

from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 30, 40, 50, 100],
    'max_features': [3, 4, 5],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [4, 8, 10],
    'n_estimators': [100, 500, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = 5, verbose = True)

In [138]:
# Fit the grid search to the data
grid_search.fit(tf_x_train, Y_train)
grid_search.best_params_
# {'bootstrap': True,
#  'max_depth': 50,
#  'max_features': 5,
#  'min_samples_leaf': 3,
#  'min_samples_split': 4,
#  'n_estimators': 100}

Fitting 3 folds for each of 405 candidates, totalling 1215 fits
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.68 µs


In [139]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 50,
 'max_features': 5,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 100}

In [155]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=50, n_estimators=1000, random_state=0)

In [156]:
# fit model to trainning data
clf.fit(tf_x_train,Y_train)

RandomForestClassifier(max_depth=50, n_estimators=1000, random_state=0)

In [157]:
# predicting the test data
y_test_pred = clf.predict(tf_x_test)

In [158]:
# analyze results
from sklearn.metrics import classification_report
report = classification_report(Y_test, y_test_pred, output_dict=True)

In [159]:
report

{'0': {'f1-score': 0.893657606103958,
  'precision': 0.9044401544401545,
  'recall': 0.883129123468426,
  'support': 1061},
 '1': {'f1-score': 0.8848735157459989,
  'precision': 0.873598369011213,
  'recall': 0.8964435146443515,
  'support': 956},
 'accuracy': 0.8894397620228062,
 'macro avg': {'f1-score': 0.8892655609249784,
  'precision': 0.8890192617256838,
  'recall': 0.8897863190563887,
  'support': 2017},
 'weighted avg': {'f1-score': 0.8894941998658772,
  'precision': 0.8898220350201901,
  'recall': 0.8894397620228062,
  'support': 2017}}

## XGBoost

In [131]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


# parameter grid
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.1], #so called `eta` value
              'max_depth': [5, 6, 7, 8],
              'min_child_weight': [8, 10, 11],
              'silent': [1],
              'subsample': [0.8, 0.9],
              'colsample_bytree': [0.7],
              'n_estimators': [800, 900, 1000]
              }

xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=parameters, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(tf_x_train, Y_train), verbose=3, random_state=1001 )

# fit model with GS-CV
random_search.fit(tf_x_train, Y_train)
%time

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.82 µs
Fitting 3 folds for each of 5 candidates, totalling 15 fits


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7fec178d6950>,
                   estimator=XGBClassifier(learning_rate=0.02, n_estimators=600,
                                           nthread=1, silent=True),
                   n_iter=5, n_jobs=4,
                   param_distributions={'colsample_bytree': [0.7],
                                        'learning_rate': [0.01, 0.02, 0.03,
                                                          0.05, 0.1],
                                        'max_depth': [5, 6, 7, 8],
                                        'min_child_weight': [8, 10, 11],
                                        'n_estimators': [800, 900, 1000],
                                        'nthread': [4],
                                        'objective': ['binary:logistic'],
                                        'silent': [1],
                                        'subsample': [0.8, 0.9]},
                   random_state=1001, scoring='roc_auc', 

In [132]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([141.12503552, 195.3893737 , 201.83231362, 202.55778003,
       142.68639787]), 'std_fit_time': array([0.43400143, 0.63233834, 1.12968749, 0.57291537, 2.47800802]), 'mean_score_time': array([0.61430184, 0.96983918, 0.87197741, 0.87802625, 0.28128171]), 'std_score_time': array([0.10817371, 0.07958129, 0.0695021 , 0.0592604 , 0.13069639]), 'param_subsample': masked_array(data=[0.8, 0.9, 0.9, 0.8, 0.9],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_silent': masked_array(data=[1, 1, 1, 1, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_objective': masked_array(data=['binary:logistic', 'binary:logistic',
                   'binary:logistic', 'binary:logistic',
                   'binary:logistic'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_nthread': ma

In [133]:
# predicting the test data
y_test_pred = random_search.predict(tf_x_test)

In [134]:
# analyze results
from sklearn.metrics import classification_report
report = classification_report(Y_test, y_test_pred, output_dict=True)

In [135]:
report

{'0': {'f1-score': 0.899526066350711,
  'precision': 0.9046711153479504,
  'recall': 0.8944392082940622,
  'support': 1061},
 '1': {'f1-score': 0.8898128898128899,
  'precision': 0.8842975206611571,
  'recall': 0.895397489539749,
  'support': 956},
 'accuracy': 0.894893406048587,
 'macro avg': {'f1-score': 0.8946694780818004,
  'precision': 0.8944843180045537,
  'recall': 0.8949183489169056,
  'support': 2017},
 'weighted avg': {'f1-score': 0.8949222999797853,
  'precision': 0.8950146173208932,
  'recall': 0.894893406048587,
  'support': 2017}}

## Best Model

In [172]:
import pickle
# the best model is:
best_model = svm
# save the model
filename = 'best_classifier.sav'
pickle.dump(best_model, open(filename, 'wb'))

# Predict for all other entries using best model

## Turn other comments into their TF IDF Forms

In [202]:
df_full = pd.read_csv('/content/drive/MyDrive/Hackathon M2/Our Data/Merged_data_without_date.csv')

In [203]:
df_full.tail(5)

Unnamed: 0.2,Unnamed: 0,score,title,user_country,user_name,is_verified,review_body,airline,seat,aircraft,layout,Unnamed: 0.1,Cabin Staff Service,Food & Beverages,Ground Service,Recommended,Route,Seat Comfort,Seat Type,Type Of Traveller,Value For Money,Inflight Entertainment,Wifi & Connectivity,link
34387,34387,,,,SeatGuru,,This is a terrible aircraft. Upon entering the...,Westjet_,20B,Westjet_Bombardier_Q400,,16746.0,,,,,,,,,,,,https://www.seatguru.com//airlines/Westjet/Wes...
34388,34388,,,,SeatGuru,,While these seats may be at the front of the p...,Westjet_,1B,Westjet_Bombardier_Q400,,16747.0,,,,,,,,,,,,https://www.seatguru.com//airlines/Westjet/Wes...
34389,34389,,,,SeatGuru,,Rather comfy despite it being listed at 17 inc...,Xiamen_Airlines_,,Xiamen_Airlines_Boeing_787-8,,16748.0,,,,,,,,,,,,https://www.seatguru.com//airlines/Xiamen_Airl...
34390,34390,,,,SeatGuru,,I was very satisfied with my flight. The Fligh...,Xiamen_Airlines_,1A,Xiamen_Airlines_Boeing_787-9,,16749.0,,,,,,,,,,,,https://www.seatguru.com//airlines/Xiamen_Airl...
34391,34391,,,,SeatGuru,,AMS-XIA in May 2018 (codeshare KLM-Xiamenair)....,Xiamen_Airlines_,14A,Xiamen_Airlines_Boeing_787-9,,16750.0,,,,,,,,,,,,https://www.seatguru.com//airlines/Xiamen_Airl...


## Pre-Processing

### Full Reviews

In [207]:
# add title to review_body
c_w_t = []
for i, row in enumerate(df_full['title']):
  if row != np.nan:
    c_w_t.append(str(row) + str(df_full['review_body'].iloc[i]))
  else:
    c_w_t.append(df_full['review_body'].iloc[i])

In [210]:
df_full['full_review'] = c_w_t

In [212]:
# new df with only relevant inputs for TF IDF + Classifier (might experiment with other variables later)
df_slim = df_full[['score', 'full_review']]

### Labeling by score

In [213]:
# create binary label of reviews from scores giving on review
'''
If score is 7-10 is a good rating
6 is neutral
1-5 is bad

If score is 7-10 then 1 (labeled good)
if score is 6 these rows will be ignored
if 1-5 then 0 (labeled bad)
'''

# filter our reviews with 6
df_slimmer = df_slim[df_slim['score'] != 6]

# label reviews
labels = []
for row in df_slimmer['score']:
    if (row <= 5):    labels.append(0) # bad reviews
    else:   labels.append(1) # good reviews

df_slimmer['label'] = labels


### Lemmatization

In [215]:
# lower case
df_slimmer['lower_case'] = df_slimmer['full_review'].apply(lambda x: ' '.join(x.lower() for x in str(x).split()))
# remove possible HTML tags and any URLS
from bs4 import BeautifulSoup
df_slimmer['pre_process']=df_slimmer['lower_case'].apply(lambda x: BeautifulSoup(x).get_text())
import re
df_slimmer['pre_process']=df_slimmer['pre_process'].apply(lambda x: re.sub(r"http\S+", '', x))
# contractions
def contractions(s):
 s = re.sub(r"won’t", "will not",s)
 s = re.sub(r"-", " to ",s) # for things like CDG-ATL
 s = re.sub(r"would’t", "would not",s)
 s = re.sub(r"nan", "",s)
 s = re.sub(r"could’t", "could not",s)
 s = re.sub(r"cannot", "can not",s)
 s = re.sub(r"\’d", " would",s)
 s = re.sub(r"can\’t", "can not",s)
 s = re.sub(r"n\’t", " not", s)
 s= re.sub(r"\’re", " are", s)
 s = re.sub(r"\’s", " is", s)
 s = re.sub(r"\’ll", " will", s)
 s = re.sub(r"\’t", " not", s)
 s = re.sub(r"\’ve", " have", s)
 s = re.sub(r"\’m", " am", s)
 return s

# replace contractions
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x:contractions(x))
# remove non-alphabet characters
nltk.download('punkt')
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: " ".join([re.sub("[^A-Za-z]+","", x) for x in nltk.word_tokenize(x)]))
df_slimmer = df_slimmer[['score', 'label', 'full_review', 'pre_process']]
# remove extra spaces
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: re.sub(' +', " ", x))
# remove stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words("english")
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))
# lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df_slimmer['pre_process'] = df_slimmer['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
# pass over ready df
df_full_pp = df_slimmer[['score', 'label', 'pre_process']]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [216]:
# renaming columns to not get it twisted 
df_full_pp = df_full_pp.rename(columns={'pre_process': 'lemmatized'})

### TF IDF of all comments

In [218]:
# TD IDF vectorizer
print("TFIDF Vectorizer……")

from sklearn.feature_extraction.text import TfidfVectorizer

# using previous vectorizer
tf_full = vectorizer.transform(df_full_pp['lemmatized'])
print(" done!")

TFIDF Vectorizer……
 done!


### Predict Labels with best classifier (SVM)

In [219]:
# predict labels for all data
pred_full = svm.predict(tf_full)

In [220]:
len(pred_full)

33995

### Add Labels to DF

In [221]:
# add predictions to dataset
df_full_pp['predicted_labels'] = pred_full

In [223]:
df_full_pp['original_review'] = df_slimmer['full_review']

In [224]:
df_with_clf_predictions = df_full_pp

In [226]:
df_with_clf_predictions

Unnamed: 0,score,label,lemmatized,predicted_labels,original_review
0,7.0,1,trip pretty good flight dulles paris great fli...,1,"""my trip was pretty good.""My flight from Dulle..."
1,1.0,0,baggage nt make went month nice arrived nice b...,0,"""my baggage didn't make it"" I went for 2 month..."
2,3.0,0,air france customer reviewyesterday jan flew c...,0,"Air France customer reviewYesterday Jan 25, we..."
3,8.0,1,great alternative fly across atlantic first me...,1,"""great alternative to fly across the Atlantic""..."
4,8.0,1,pleasurable experience bod cdg atl major issue...,1,"""A pleasurable experience!"" BOD-CDG-ATL: Was h..."
...,...,...,...,...,...
34387,,1,terrible aircraft upon entering aircraft bange...,0,nanThis is a terrible aircraft. Upon entering ...
34388,,1,seat may front plane get get quite chilly righ...,1,nanWhile these seats may be at the front of th...
34389,,1,rather comfy despite listed inch wide legroom ...,1,nanRather comfy despite it being listed at 17 ...
34390,,1,satisfied flight flight attendant would every ...,1,nanI was very satisfied with my flight. The Fl...


### Save DF

In [229]:
# Save df as csv
df_with_clf_predictions.to_csv('df_with_clf_predictions.csv')