In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import multilabel_confusion_matrix

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
data=pd.read_csv('train.csv')

(                                                text sentiment
 0   oh Marly, I`m so sorry!!  I hope you find her...   neutral
 1  Playing Ghost Online is really interesting. Th...  positive
 2  is cleaning the house for her family who is co...   neutral
 3  gotta restart my computer .. I thought Win7 wa...   neutral
 4  SEe waT I Mean bOuT FoLL0w fRiiDaYs... It`S cA...   neutral, (27448, 2))

In [3]:
X=data.text
y=data.sentiment
y=pd.get_dummies(y)

22344    is soo bored its lovley day outside but nuttin...
12578    _007 so I tried to send you a direct message a...
11896    half of me wants to go to sleep, half of me do...
15535     Ooh, I love sweet potato fries! We should def...
11008     sold, so i hit cash and just took the tax out...
Name: text, dtype: object

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=55,stratify=y)

In [4]:
# keeping only alphabets
X_train=X_train.str.replace("[^a-zA-Z]", " ")

# changing all text to lowercase
X_train=X_train.str.lower()

22344    is soo bored its lovley day outside but nuttin...
12578         so i tried to send you a direct message a...
11896    half of me wants to go to sleep  half of me do...
15535     ooh  i love sweet potato fries  we should def...
11008     sold  so i hit cash and just took the tax out...
Name: text, dtype: object

In [7]:
lem=WordNetLemmatizer()
stop=stopwords.words('english')

In [8]:
# function to lemmatize text
def lemm_text(string):
    temp=word_tokenize(string)
    # only processing tokens that aren't in the STOP WORDS list
    word_list=[lem.lemmatize(word) for word in temp if not word in stop]
    return " ".join(word_list)

In [9]:
# lemmatizing text
X_train=X_train.apply(lambda x:lemm_text(str(x)))

22344                  soo bored lovley day outside nuttin
12578                  tried send direct message following
11896    half want go sleep half dosen shoot half dying...
15535              ooh love sweet potato fry definitely go
11008    sold hit cash took tax tip drawer ended dollar...
Name: text, dtype: object

In [10]:
# creating the tfidf matrix
tfidf=TfidfVectorizer()
tfidf.fit(X_train)
tr_text=tfidf.transform(X_train)

<20586x18896 sparse matrix of type '<class 'numpy.float64'>'
	with 141544 stored elements in Compressed Sparse Row format>

In [11]:
train_df=pd.DataFrame(tr_text.toarray(), columns=tfidf.get_feature_names())

Unnamed: 0,aa,aaa,aaaa,aaaaaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaahhhhhhhh,aaaaaah,aaaaaahhhhhhhh,aaaaahhhh,...,zv,zwarte,zwitschert,zx,zxoj,zywwj,zzre,zzzz,zzzzy,zzzzzzzzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [13]:
clf=OneVsRestClassifier(LogisticRegression())
clf.fit(train_df,y_train)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [14]:
# testing performance on training set
pred=clf.predict(train_df)
multilabel_confusion_matrix(pred,y_train)

array([[[14458,  2593],
        [  303,  3232]],

       [[11115,  2192],
        [ 1141,  6138]],

       [[13673,  2202],
        [  482,  4229]]], dtype=int64)

In [15]:
## Testing phase

In [16]:
# preprocessing steps
X_test=X_test.str.replace("[^a-zA-Z]", " ")
X_test=X_test.str.lower()
X_test=X_test.apply(lambda x:lemm_text(str(x)))

In [None]:
# transforming to tfidf features
tst_text=tfidf.transform(X_test)

In [17]:
test_df=pd.DataFrame(tst_text.toarray(), columns=tfidf.get_feature_names())

In [18]:
# predictions
pred_test=clf.predict(test_df)

# testing performance on test set
multilabel_confusion_matrix(pred_test,y_test)

array([[[4693, 1005],
        [ 227,  937]],

       [[3281, 1146],
        [ 805, 1630]],

       [[4474,  872],
        [ 244, 1272]]], dtype=int64)

In [21]:
# saving the models for later use
from sklearn.externals import joblib
joblib.dump(tfidf, 'C:/ureka/tfidfVectorizer.pkl')
joblib.dump(clf, 'C:/ureka/classifier.pkl')



['C:/ureka/classifier.pkl']