In [4]:
import sklearn
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
scikit_learn_version = sklearn.__version__
scikit_learn_version

'0.21.2'

In [11]:
df=pd.read_csv('datasets/Sentimental_Analysis/train_data.csv',
               header=None,
               names=['Label','Text'],
               sep='\t')
df.head()

Unnamed: 0,Label,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [12]:
df.shape

(6918, 2)

In [14]:
X=df['Text']
Y=df['Label']

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2)

In [16]:
tfidf_vect = TfidfVectorizer(max_features=15)
x_trains = tfidf_vect.fit_transform(x_train)

In [18]:
print(x_trains[0:3])

  (0, 3)	0.5138355372103066
  (0, 13)	0.5139796330722513
  (0, 4)	0.5139796330722513
  (0, 12)	0.45565657492636247
  (1, 11)	0.7071067811865475
  (1, 5)	0.7071067811865475
  (2, 1)	0.5099810198958844
  (2, 14)	0.5017655138665356
  (2, 3)	0.40330643728141863
  (2, 13)	0.4034195372608856
  (2, 4)	0.4034195372608856


In [19]:
x_trains.shape

(5534, 15)

In [27]:
classifiers = LinearSVC(C=1.0, max_iter=1000,tol=1e-3)
linear_scv_model = classifiers.fit(x_trains, y_train)
linear_scv_model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)

In [28]:
x_test_trains= tfidf_vect.fit_transform(x_test)

In [29]:
x_test_trains.shape

(1384, 15)

In [30]:
y_pred=linear_scv_model.predict(x_test_trains)
y_pred

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [31]:
pred_result=pd.DataFrame({'y_test':y_test,
                         'y_pred': y_pred})
pred_result.sample(5)

Unnamed: 0,y_test,y_pred
6844,0,0
2743,1,1
553,1,1
3512,1,0
5488,0,0


In [32]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8930635838150289

In [33]:
text_clf_param = {}
text_clf_param['preprocessing']= tfidf_vect
text_clf_param['model']= linear_scv_model
text_clf_param['sklearn_version']=scikit_learn_version
text_clf_param['accuracy']=accuracy

In [34]:
import joblib

In [36]:
filename='models/text_clf_checkpoint.joblib'

In [37]:
joblib.dump(text_clf_param,filename)

['models/text_clf_checkpoint.joblib']

In [38]:
clf_checkpoint = joblib.load(filename)

In [40]:
reloaded_vect = clf_checkpoint['preprocessing']
reloaded_vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=15,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [41]:
clf_model = clf_checkpoint['model']
clf_model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)

In [42]:
x_text_trans_new = reloaded_vect.fit_transform(x_test)

In [44]:
y_pred = clf_model.predict(x_text_trans_new)
y_pred

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [45]:
accuracy_score(y_test,y_pred), clf_checkpoint['accuracy']

(0.8930635838150289, 0.8930635838150289)

In [47]:
from sklearn.pipeline import Pipeline

In [48]:
clf_pipeline = Pipeline(steps=[('tfidf_vect',tfidf_vect),('classifiers',classifiers)])
pipeline_model = clf_pipeline.fit(x_train,y_train)

In [49]:
y_pred = pipeline_model.predict(x_test)

In [50]:
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.8930635838150289

In [51]:
pipe_clf_param ={}
pipe_clf_param['pipeline_clf']=pipeline_model
pipe_clf_param['sklearn_version']=scikit_learn_version
pipe_clf_param['accuracy']=accuracy

In [54]:
filename='models/pipe_clf_checkpoint.joblib'

In [55]:
joblib.dump(pipe_clf_param,filename)

['models/pipe_clf_checkpoint.joblib']

In [56]:
pipe_clf_checkpoint =joblib.load(filename)

In [57]:
reloaded_pipeline = pipe_clf_checkpoint['pipeline_clf']
reloaded_pipeline

Pipeline(memory=None,
         steps=[('tfidf_vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=15,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifiers',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
    

In [58]:
y_pred= reloaded_pipeline.predict(x_test)

In [59]:
accuracy_score(y_test,y_pred)

0.8930635838150289

In [60]:
pipe_clf_checkpoint['accuracy']

0.8930635838150289