In [1]:
import sklearn
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the tab-separated dataset. The file has no header row, so the two
# columns are named explicitly: the label first, then the text.
sentimental_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)

# Show ten randomly chosen rows as a quick sanity check of the parse.
sentimental_data.sample(10)

Unnamed: 0,Label,Text
926,1,I love The Da Vinci Code...
6604,0,So Brokeback Mountain was really depressing.
2931,1,"I, too, like Harry Potter.."
919,1,Love luv lubb the Da Vinci Code!
2055,1,The Harry Potter scar would be cool...
3811,1,"Anyway, thats why I love "" Brokeback Mountain."
5055,0,meganpenworthy dressed as a character from Har...
6532,0,"Oh, and Brokeback Mountain is a TERRIBLE movie..."
2517,1,I love Harry Potter..
4831,0,"Da Vinci Code = Up, Up, Down, Down, Left, Righ..."


In [3]:
# Dimensions of the loaded frame as (rows, columns).
sentimental_data.shape

(6918, 2)

In [4]:
# Split the frame into model input (the text column) and target (the label column).
X, Y = sentimental_data['Text'], sentimental_data['Label']

In [5]:
# Display the text Series that will be fed to the vectoriser.
X

0                 The Da Vinci Code book is just awesome.
1       this was the first clive cussler i've ever rea...
2                        i liked the Da Vinci Code a lot.
3                        i liked the Da Vinci Code a lot.
4       I liked the Da Vinci Code but it ultimatly did...
                              ...                        
6913                       Brokeback Mountain was boring.
6914         So Brokeback Mountain was really depressing.
6915    As I sit here, watching the MTV Movie Awards, ...
6916      Ok brokeback mountain is such a horrible movie.
6917     Oh, and Brokeback Mountain was a terrible movie.
Name: Text, Length: 6918, dtype: object

In [8]:
# Display of the label Series is switched off; uncomment the line below to inspect it.
# Y

In [9]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every accuracy figure computed from this split,
# repeatable across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [11]:
# Sizes of the training texts and training labels; the two should agree.
x_train.shape, y_train.shape

((5534,), (5534,))

In [13]:
# Sizes of the held-out texts and held-out labels; the two should agree.
x_test.shape, y_test.shape

((1384,), (1384,))

In [14]:
# TF-IDF text vectoriser whose vocabulary is capped at 15 terms (max_features=15).
# NOTE(review): this one instance is passed to every pipeline built later in the
# notebook, so each pipeline fit presumably re-fits the same object — confirm
# that sharing it is intended.
tfidf_vect = TfidfVectorizer(max_features=15)

In [15]:
# Logistic-regression classifier configured to use the liblinear solver;
# all other hyper-parameters are left at the library defaults.
logistic_clf = LogisticRegression(solver='liblinear')

In [16]:
# Chain vectorisation and classification so raw text can be passed straight
# to fit/predict and both stages are saved together.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', logistic_clf),
    ]
)

# Fit both stages on the training split; keep the value returned by fit().
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [17]:
# Predict labels for the held-out texts with the fitted logistic-regression pipeline.
y_pred = pipeline_model.predict(x_test)

In [18]:
# Fraction of held-out samples whose predicted label equals the true label.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare expression so the notebook renders the value.
Accuracy_score

0.9053468208092486

In [19]:
# Persist the fitted logistic-regression pipeline (vectoriser + classifier).
# The context manager closes the file handle as soon as the dump finishes, so
# the bytes are flushed to disk and no descriptor is left open in the kernel.
with open('models/logistic_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [20]:
# Decision tree limited to depth 10. Tree construction permutes the candidate
# features at every split, so random_state is pinned to make repeated fits on
# the same data produce the same tree.
decision_tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)

In [21]:
# Same two-stage layout as before, now with the decision tree as the classifier.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', decision_tree_clf),
    ]
)

# Fit both stages on the training split; keep the value returned by fit().
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [22]:
# Predict labels for the held-out texts with the fitted decision-tree pipeline.
y_pred = pipeline_model.predict(x_test)

In [23]:
# Fraction of held-out samples whose predicted label equals the true label.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare expression so the notebook renders the value.
Accuracy_score

0.9010115606936416

In [24]:
# Persist the fitted decision-tree pipeline (vectoriser + classifier).
# The context manager closes the file handle as soon as the dump finishes, so
# the bytes are flushed to disk and no descriptor is left open in the kernel.
with open('models/decision_tree_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [25]:
# Linear support-vector classifier with C=1.0 and at most 100 solver iterations.
# The default dual coordinate-descent solver shuffles the data, so random_state
# is pinned to make repeated fits on the same data give the same model.
# NOTE(review): max_iter=100 is a low cap — watch for ConvergenceWarning when fitting.
linear_svc_clf = LinearSVC(C=1.0, max_iter=100, random_state=42)

In [26]:
# Same two-stage layout as before, now with the linear SVC as the classifier.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', linear_svc_clf),
    ]
)

# Fit both stages on the training split; keep the value returned by fit().
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [27]:
# Predict labels for the held-out texts with the fitted linear-SVC pipeline.
y_pred = pipeline_model.predict(x_test)

In [28]:
# Fraction of held-out samples whose predicted label equals the true label.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare expression so the notebook renders the value.
Accuracy_score

0.9046242774566474

In [29]:
# Persist the fitted linear-SVC pipeline (vectoriser + classifier).
# The context manager closes the file handle as soon as the dump finishes, so
# the bytes are flushed to disk and no descriptor is left open in the kernel.
with open('models/linear_svc_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [30]:
# Record the scikit-learn version used to fit and pickle the models above;
# pickled estimators are generally only safe to load with the same version.
sklearn.__version__

'0.22.2.post1'