In [55]:
import pandas as pd
#read files and look at train dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [56]:
test.shape

(29404, 4)

In [57]:
train.shape

(38932, 5)

In [58]:
train['Is_Response'].value_counts()
#There is significant imbalance

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [59]:
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


In [60]:
#check if there are any missing values
train.isnull().sum()

User_ID         0
Description     0
Browser_Used    0
Device_Used     0
Is_Response     0
dtype: int64

In [61]:
from scipy.stats import chi2_contingency
for col in train.columns:
    train[col]=train[col].astype('str')
    
#Null hypothesis there is no significant relationship between the variables
device_tab = pd.crosstab(train.Device_Used,train.Is_Response)
device_tab

Is_Response,happy,not happy
Device_Used,Unnamed: 1_level_1,Unnamed: 2_level_1
Desktop,10595,4431
Mobile,10602,4374
Tablet,5324,3606


In [62]:
chi2_contingency(device_tab)

(385.99747606801725,
 1.5195432214497265e-84,
 2,
 array([[ 10235.91251413,   4790.08748587],
        [ 10201.85184424,   4774.14815576],
        [  6083.23564163,   2846.76435837]]))

In [63]:
browser_tab = pd.crosstab(train.Browser_Used,train.Is_Response)
browser_tab

Is_Response,happy,not happy
Browser_Used,Unnamed: 1_level_1,Unnamed: 2_level_1
Chrome,2103,367
Edge,3218,3916
Firefox,3685,3682
Google Chrome,4035,624
IE,2099,340
Internet Explorer,1845,258
InternetExplorer,3967,621
Mozilla,1841,1251
Mozilla Firefox,3203,1125
Opera,241,121


In [64]:
chi2_contingency(browser_tab)

(5536.8794651171429, 0.0, 10, array([[ 1682.59709237,   787.40290763],
        [ 4859.77637933,  2274.22362067],
        [ 5018.499101  ,  2348.500899  ],
        [ 3173.77321997,  1485.22678003],
        [ 1661.47947704,   777.52052296],
        [ 1432.5917754 ,   670.4082246 ],
        [ 3125.40706874,  1462.59293126],
        [ 2106.31182575,   985.68817425],
        [ 2948.29158533,  1379.70841467],
        [  246.59924997,   115.40075003],
        [  265.67322511,   124.32677489]]))

In [65]:
browser_device = pd.crosstab(train.Browser_Used,train.Device_Used)
browser_device

Device_Used,Desktop,Mobile,Tablet
Browser_Used,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chrome,977,940,553
Edge,2707,2637,1790
Firefox,2791,2756,1820
Google Chrome,1831,1842,986
IE,907,961,571
Internet Explorer,819,816,468
InternetExplorer,1835,1819,934
Mozilla,1168,1233,691
Mozilla Firefox,1695,1692,941
Opera,142,140,80


In [66]:
chi2_contingency(browser_device)
#There is significant relationship between all pairs of variables as p values falls in the rejection rate


(69.325062238481436,
 2.3465220953907307e-07,
 20,
 array([[  953.30884619,   950.13664852,   566.55450529],
        [ 2753.40295901,  2744.24083017,  1636.35621083],
        [ 2843.33047365,  2833.86910511,  1689.80042125],
        [ 1798.16433782,  1792.18082811,  1068.65483407],
        [  941.34424124,   938.21185657,   559.44390219],
        [  811.66336176,   808.96249872,   482.37413953],
        [ 1770.76153293,  1764.86920785,  1052.36925922],
        [ 1193.37285523,  1189.40182883,   709.22531594],
        [ 1670.41323333,  1664.8548238 ,   992.73194287],
        [  139.71570944,   139.25079626,    83.0334943 ],
        [  150.5224494 ,   150.02157608,    89.45597452]]))

# Data Cleaning

In [67]:
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer("english")
stopwords = set(stopwords.words("english"))
def clean_description(row):
    txt = str(row['Description'])
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    return " ".join([snowball_stemmer.stem(x)for x in txt.lower().split() if x not in stopwords])    

In [68]:
import numpy as np
test['Is_response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [69]:
alldata['Description'] = alldata.apply(clean_description,axis=1)

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score,make_scorer

cv = CountVectorizer(min_df=150,max_features=500,analyzer='word', ngram_range = (1,1))

In [71]:
count_vector = cv.fit_transform(alldata['Description'])
count_vector_df = pd.DataFrame(count_vector.todense())

In [72]:
#split count vector into train and test
count_vector_df.columns = ['col_' + str(x) for x in cv.vocabulary_ .keys()]
count_vector_train = count_vector_df[:len(train)]
count_vector_test = count_vector_df[len(train):]

#split all data into train and test
train = alldata[~pd.isnull(alldata.Is_Response)]
test = alldata[pd.isnull(alldata.Is_Response)]

In [73]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_y = le.fit_transform(train['Is_Response'])


<h6>Use other features</h6>

In [74]:
train = pd.concat([train,count_vector_train],axis=1)
train.shape

(38932, 506)

In [75]:
browser_encoder = LabelEncoder()
train['Browser_Used'] = browser_encoder.fit_transform(train['Browser_Used'])
device_encoder = LabelEncoder()
train['Device_Used'] = device_encoder.fit_transform(train['Device_Used'])
train.drop(['User_ID','Description','Is_Response'],axis=1,inplace=True)
train.sort_index(axis=1, inplace=True)
from xgboost import XGBClassifier
model = XGBClassifier(max_depth=5, n_estimators=460, learning_rate=0.05,scale_pos_weight = 1,min_child_weight = 2,gamma = 0.0,subsample =0.5, colsample_bytree = 0.5,max_delta_step=1,objective = 'binary:logistic')
model.fit(train,train_y)
scorer = make_scorer(accuracy_score)
score = cross_val_score(model,train,train_y,scoring=scorer,cv=5)
score

array([ 0.87172573,  0.87631647,  0.87978423,  0.87156435,  0.87349088])

In [76]:
train.columns

Index(['Browser_Used', 'Device_Used', 'Is_response', 'col_abl', 'col_absolut',
       'col_access', 'col_accommod', 'col_across', 'col_actual', 'col_air',
       ...
       'col_without', 'col_wonder', 'col_work', 'col_worth', 'col_would',
       'col_wouldnt', 'col_year', 'col_yes', 'col_york', 'col_your'],
      dtype='object', length=503)

In [77]:

#test_cv = pd.DataFrame(count_vector_test.todense())
test = pd.concat([test,count_vector_test],axis=1)
test['Browser_Used'] = browser_encoder.transform(test['Browser_Used'])
test['Device_Used'] = device_encoder.transform(test['Device_Used'])

In [78]:
test_temp = test[train.columns]
#test.sort_index(axis=1, inplace=True)
test_temp.columns

Index(['Browser_Used', 'Device_Used', 'Is_response', 'col_abl', 'col_absolut',
       'col_access', 'col_accommod', 'col_across', 'col_actual', 'col_air',
       ...
       'col_without', 'col_wonder', 'col_work', 'col_worth', 'col_would',
       'col_wouldnt', 'col_year', 'col_yes', 'col_york', 'col_your'],
      dtype='object', length=503)

In [79]:
cols_to_use = list(set(test.columns) - set(['User_ID','Description','Is_Response']))
len(cols_to_use)

503

In [80]:
y_pred = model.predict(test_temp)
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [81]:
test['Is_Response']=le.inverse_transform(y_pred)
test.to_csv("happiness.csv",columns=['User_ID','Is_Response'],index=False)