# Pipline - Use case 1

In [32]:
import pandas as pd
import nltk
from nltk.corpus import stopwords    
import random 
nltk.download('stopwords')
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\onkar.patil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\onkar.patil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\onkar.patil\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
# load the dataset
df=pd.read_csv('Text_data.csv',index_col=[0])

In [3]:
df

Unnamed: 0,review,sentiment,length,comma,stopwords
0,One of the other reviewers has mentioned that ...,positive,307,26.0,48.0
1,A wonderful little production. <br /><br />The...,positive,162,5.0,26.0
2,I thought this was a wonderful way to spend ti...,positive,166,6.0,32.0
3,Basically there's a family where a little boy ...,negative,138,3.0,28.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230,12.0,39.0
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,194,16.0,43.0
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,112,8.0,25.0
49997,I am a Catholic taught in parochial elementary...,negative,230,5.0,37.0
49998,I'm going to have to disagree with the previou...,negative,212,9.0,38.0


In [16]:
#split train and test
X_train, X_test, y_train, y_test = train_test_split(df.drop('sentiment',axis=1), df['sentiment'], test_size=0.33, random_state=42)

## Build a pipeline 

### Steps :
#### 1. Create a pipeline modules for each df column with parameter to tune
#### 2. Combine all the modules and add estimator 
#### 3. Hyper parameter tuning with Gridcv

In [20]:
# create pipeline modules for each column



#Import sklearn base estimator to wrap the function
from sklearn.base import BaseEstimator

# This is your non-sklearn data transform function specific to your data
class Process():
    def __init__(self):
        pass
    def transform(self,x):
        #k=np.array([int(i)+10 for i in x]).reshape(-1, 1)
        #print(k.shape)
        return np.array([int(i)+10 for i in x]).reshape(-1, 1)
    
# This is how we use 'BaseEstimator' to wrap "Process" using "Transform" class       
class Transform(BaseEstimator):
    def __init__(self):
        self.t = Process()
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        return self.t.transform(x)


# for Text function     
    
class Lemmatize(BaseEstimator):
    def __init__(self):
        self.l = WordNetLemmatizer()
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x = map(lambda r:  ' '.join([self.l.lemmatize(i.lower()) for i in r.split()]), x)
        x = np.array(list(x))
        return x

# 1. comma features 

comma_pipe=Pipeline([
    ('imputer1',SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('process1',Transform()),
    ('Normalize1',Normalizer()),
])

# 2. stopwords feature

stop_pipe=Pipeline([
    ('imputer1',SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('process1',Transform()),
])


# 3. review

review_pipe=Pipeline([
    ('lemma',Lemmatize()),
    ('tfidf', TfidfVectorizer(max_features=2500)),
])


# combine all the pipeline modeules to create Data transform pipeline

preprocessor = ColumnTransformer([
    ("comma", comma_pipe, ["comma"]),
    ("stopwords", stop_pipe, ["stopwords"]),
    ("review", review_pipe, "review"),
],remainder="passthrough")

pre_pipe=Pipeline([
    ('preprocess',preprocessor)
])

In [21]:
pre_pipe.fit_transform(X_train)


<33500x2503 sparse matrix of type '<class 'numpy.float64'>'
	with 3666373 stored elements in Compressed Sparse Row format>

In [27]:
# Add model estimator

# 1. Logistic Regression

logistic=Pipeline([('preprocess',pre_pipe),
                  ('lg',LogisticRegression()),
                  ])


# 2. svm
svm=Pipeline([('preprocess',pre_pipe),
                  ('svc',SVC())])

# 2. svm
rf=Pipeline([('preprocess',pre_pipe),
                  ('random',RandomForestClassifier())])

## Training and hyper parameter tuning

In [33]:
# logistic regression

# simple cross validation

logistic.fit(X_train, y_train)
print(logistic.score(X_test,y_test))

0.8643636363636363


In [39]:
print('Actual result:',y_test[:5].tolist())
print('\nPredicted result:',logistic.predict(X_test[:5]))

Actual result: ['positive', 'positive', 'negative', 'positive', 'negative']

Predicted result: ['negative' 'positive' 'negative' 'positive' 'negative']


In [43]:
# Hyperparameter tuning using GridsearchCV

scoring='roc_auc'
cv=3
n_jobs=-1

param_grid = [{
    'preprocess__preprocess__review__tfidf__max_features': [2500, 5000, 10000], 
    'lg__C': [1., 3.],
}]

grid = GridSearchCV(logistic, cv=cv, n_jobs=n_jobs, param_grid=param_grid,
                          scoring=scoring, verbose=1)
grid.fit(df.drop('sentiment',axis=1), df['sentiment'])
grid.cv_results_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


{'mean_fit_time': array([18.7731901 , 19.59779628, 20.22611721, 19.60668445, 20.42669153,
        15.15031123]),
 'std_fit_time': array([0.34376869, 0.07886658, 0.35580649, 0.41647155, 0.05353523,
        3.79509243]),
 'mean_score_time': array([6.24255363, 6.33001868, 6.9442548 , 7.04272763, 7.34694839,
        4.90396953]),
 'std_score_time': array([0.09049086, 0.0380744 , 0.38398387, 0.11548303, 0.04911564,
        1.65623614]),
 'param_lg__C': masked_array(data=[1.0, 1.0, 1.0, 3.0, 3.0, 3.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_preprocess__preprocess__review__tfidf__max_features': masked_array(data=[2500, 5000, 10000, 2500, 5000, 10000],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'lg__C': 1.0,
   'preprocess__preprocess__review__tfidf__max_features': 2500},
  {'lg__C': 1.0, 'preprocess__preprocess__review__tfidf_

In [101]:
review_pipe=Pipeline([
    #('lemma',Lemmatize()),
    ('tfidf', TfidfVectorizer(max_features=2500)),
])

In [88]:
review_pipe.fit_transform(df.review)

<50000x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 5332104 stored elements in Compressed Sparse Row format>

In [98]:
preprocessor = ColumnTransformer(transformers=[("rev", review_pipe, ["review"])])

In [99]:
preprocessor.fit_transform(df)

array([[1.]])

In [100]:
df

Unnamed: 0,review,sentiment,length,comma,stopwords
0,One of the other reviewers has mentioned that ...,positive,307,26.0,48.0
1,A wonderful little production. <br /><br />The...,positive,162,5.0,26.0
2,I thought this was a wonderful way to spend ti...,positive,166,6.0,32.0
3,Basically there's a family where a little boy ...,negative,138,3.0,28.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230,12.0,39.0
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,194,16.0,43.0
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,112,8.0,25.0
49997,I am a Catholic taught in parochial elementary...,negative,230,5.0,37.0
49998,I'm going to have to disagree with the previou...,negative,212,9.0,38.0


In [50]:
k=df.review

In [65]:
l=Lemmatize()

In [66]:
l.fit(p)
ls=l.transform(p)

enter lema
finish lama


In [67]:
ls.shape

(50000,)

In [68]:
tf=TfidfVectorizer(max_features=2500)

In [69]:
tf.fit_transform(ls)

<50000x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 5335657 stored elements in Compressed Sparse Row format>

In [60]:
pre = ColumnTransformer([
    ("review", review_pipe, ["review"]),
])

In [70]:
p=df['review']
p

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [71]:
review_pipe.fit(p)

enter lema
finish lama


Pipeline(steps=[('lemma', Lemmatize()),
                ('tfidf', TfidfVectorizer(max_features=2500))])

In [72]:
review_pipe.transform(p)

enter lema
finish lama


<50000x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 5335657 stored elements in Compressed Sparse Row format>