## Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb

### Read Preporcessed data

In [4]:
dataset = pd.read_csv('../data/processed/pre_processed_dataset.csv', encoding="iso-8859-1")

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   target           1600000 non-null  int64 
 1   ids              1600000 non-null  int64 
 2   date             1600000 non-null  object
 3   flag             1600000 non-null  object
 4   user             1600000 non-null  object
 5   text             1600000 non-null  object
 6   stemmed_content  1599505 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [6]:
df = dataset['stemmed_content']

In [7]:
df.loc[df.isnull()] = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.isnull()] = ''


In [8]:
dataset['stemmed_content'] = df

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   target           1600000 non-null  int64 
 1   ids              1600000 non-null  int64 
 2   date             1600000 non-null  object
 3   flag             1600000 non-null  object
 4   user             1600000 non-null  object
 5   text             1600000 non-null  object
 6   stemmed_content  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [10]:
dataset.head()

Unnamed: 0,target,ids,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@kenichan i dived many times for the ball. man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


### split the data to 20% for train && 80% for train

In [11]:
X = dataset['stemmed_content']
Y = dataset['target']

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

### Vectorization, Fit and transform Data

In [13]:
vectorize = TfidfVectorizer()

In [14]:
X_train = vectorize.fit_transform(X_train)

In [15]:
X_test = vectorize.transform(X_test)

In [16]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9453607 stored elements and shape (1280000, 461280)>
  Coords	Values
  (0, 307108)	0.46206048815324474
  (0, 239679)	0.15130037108228483
  (0, 146067)	0.12929728405657018
  (0, 175252)	0.224070805470346
  (0, 128605)	0.22108856600702773
  (0, 4832)	0.317074267861159
  (0, 124524)	0.18318401951949756
  (0, 205794)	0.24140229063801746
  (0, 454381)	0.20169626473577715
  (0, 286478)	0.16123218610004272
  (0, 406297)	0.2978221095272138
  (0, 220296)	0.43015677907624866
  (0, 388138)	0.20555120011808467
  (0, 154767)	0.26976607043258233
  (1, 445870)	0.6361096685891185
  (1, 161801)	0.5778049407933611
  (1, 124611)	0.5113765148324884
  (2, 125319)	0.6383069130836649
  (2, 349409)	0.22232944888223494
  (2, 444761)	0.30331529032956345
  (2, 358186)	0.19837942712286838
  (2, 267649)	0.19309660201644555
  (2, 12436)	0.2529872032123258
  (2, 453420)	0.2347069337186747
  (2, 312657)	0.3154702974657607
  :	:
  (1279997, 124611)	0.253778

### Model Building

In [17]:
model = xgb.XGBClassifier(tree_method = 'hist', early_stopping_rounds = 2)

In [18]:
model.fit(X_train, Y_train, eval_set=[(X_train, Y_train)])

[0]	validation_0-logloss:0.67353
[1]	validation_0-logloss:0.66136
[2]	validation_0-logloss:0.65231
[3]	validation_0-logloss:0.64486
[4]	validation_0-logloss:0.63830
[5]	validation_0-logloss:0.63318
[6]	validation_0-logloss:0.62835
[7]	validation_0-logloss:0.62378
[8]	validation_0-logloss:0.62001
[9]	validation_0-logloss:0.61635
[10]	validation_0-logloss:0.61318
[11]	validation_0-logloss:0.61036
[12]	validation_0-logloss:0.60771
[13]	validation_0-logloss:0.60462
[14]	validation_0-logloss:0.60212
[15]	validation_0-logloss:0.59983
[16]	validation_0-logloss:0.59764
[17]	validation_0-logloss:0.59529
[18]	validation_0-logloss:0.59334
[19]	validation_0-logloss:0.59151
[20]	validation_0-logloss:0.58929
[21]	validation_0-logloss:0.58743
[22]	validation_0-logloss:0.58576
[23]	validation_0-logloss:0.58406
[24]	validation_0-logloss:0.58251
[25]	validation_0-logloss:0.58104
[26]	validation_0-logloss:0.57963
[27]	validation_0-logloss:0.57825
[28]	validation_0-logloss:0.57689
[29]	validation_0-loglos

### Model Evaluation

In [19]:
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)

In [20]:
print(train_score)

0.7447078125


In [21]:
print(test_score)

0.74119375


In [22]:
y_pred = model.predict(X_test[-2])

In [23]:
print(y_pred)

[0]


In [24]:
Y_test

291262     0
1359075    1
1009651    1
1538714    1
1046690    1
          ..
203556     0
163122     0
1176116    1
477177     0
897066     1
Name: target, Length: 320000, dtype: int64

### Hyper Parameter Tuning

In [25]:
model2 = xgb.XGBClassifier(tree_method = 'hist', eta = 0.9, early_stopping_rounds = 2, gamma = 4, objective="binary:logistic", max_depth = 8, subsample = 0.6)

In [26]:
model2.fit(X_train, Y_train, eval_set=[(X_train, Y_train)])

[0]	validation_0-logloss:0.64525
[1]	validation_0-logloss:0.62310
[2]	validation_0-logloss:0.60913
[3]	validation_0-logloss:0.59878
[4]	validation_0-logloss:0.59057
[5]	validation_0-logloss:0.58283
[6]	validation_0-logloss:0.57715
[7]	validation_0-logloss:0.57192
[8]	validation_0-logloss:0.56735
[9]	validation_0-logloss:0.56260
[10]	validation_0-logloss:0.55876
[11]	validation_0-logloss:0.55455
[12]	validation_0-logloss:0.55133
[13]	validation_0-logloss:0.54835
[14]	validation_0-logloss:0.54551
[15]	validation_0-logloss:0.54271
[16]	validation_0-logloss:0.54046
[17]	validation_0-logloss:0.53831
[18]	validation_0-logloss:0.53625
[19]	validation_0-logloss:0.53424
[20]	validation_0-logloss:0.53225
[21]	validation_0-logloss:0.53057
[22]	validation_0-logloss:0.52875
[23]	validation_0-logloss:0.52697
[24]	validation_0-logloss:0.52559
[25]	validation_0-logloss:0.52418
[26]	validation_0-logloss:0.52293
[27]	validation_0-logloss:0.52172
[28]	validation_0-logloss:0.52033
[29]	validation_0-loglos

In [27]:
test_score = model2.score(X_test, Y_test)

In [28]:
print(test_score)

0.760096875
