# Importing the required libraries

In [253]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
import joblib
from sklearn.pipeline import Pipeline 
import string 
import re
import nltk
import time

In [254]:
import joblib
import pickle 

# Importing the models

In [255]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVR

# Importing Preprocessing mathods

In [256]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix as score 
from sklearn.model_selection import GridSearchCV

In [257]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# creating the trainpredict class

- > The class contains the models which  can train with data and also predict the test data and give the metrics , all at once.


In [258]:
class BestModel():
    
    def __init__(self):
        print("finding the best model")
    
    def best_model(self):
        
        r1 = TrainAndPredict.Ridge_Classifier(self)
        r2 = TrainAndPredict.RandomForest_Classifier(self)
        frames = [r1, r2]
        df = pd.concat(frames)
        return df.sort_values('mean_test_score', ascending=False)
        
        

In [259]:
class TrainAndPredict(BestModel):
    
    def __init__(self, train, vect_train,  target):
        self.vect_train = vect_train
        self.target = target
    
    
    
        
    def RandomForest_Classifier(self):
        
        rf = RandomForestClassifier()
        param = {'n_estimators': [10, 150],
                'max_depth': [30, None]}
        model_name = ["Randomforestclassifier"] * 4
        gs = GridSearchCV(rf, param, cv=5)
        gs_fit = gs.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        
        results = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)
        results["model_name"] = model_name 
        results = results.drop(["split0_test_score", "split1_test_score", "split2_test_score", "split3_test_score", "split4_test_score", "param_max_depth", "param_n_estimators", "rank_test_score"], axis = 1)
        return results 
        
    def Ridge_Classifier(self):
        
        rc = RidgeClassifier(random_state = 42)
        param = {"alpha" : [1.0, 1.5, 2.0]}
        gs = GridSearchCV(rc, param, cv = 4)
        model_name = ["Ridge_classifier"] * 3
        gs_fit = gs.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        
        results = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)
        results["model_name"] = model_name
        results =  results.drop(["split0_test_score", "split1_test_score", "split2_test_score", "split3_test_score", "param_alpha", "rank_test_score"], axis = 1)
        return results
        
        
   


    def saveas_pickle(self):
        pipeline_file = open("airline_sentiment_final.pkl","wb")
        joblib.dump(self.model, pipeline_file)
        pipeline_file.close()
        print("model saved as pickle")
           
        
        

        

# creating Vectorize class

- > The class contains vectorizers like tfidf and count which convert the text into columns which are used for training.

- > the methods in the class use the main text in the data and convert into columns(bag of words) and return it to train_predict class

In [260]:
class Vectorize(TrainAndPredict):
    
    def __init__(self, train,  target):
        self.train = train
        self.target = target
        

           
    def Tfidf(self, name):
        vectorizer = TfidfVectorizer(analyzer = clean_text)
        vect_fit = vectorizer.fit(self.train[name])
        vect_trans_train = vect_fit.transform(self.train[name])
        self.vect_train = vect_trans_train.toarray()
        
        TrainAndPredict(self.train,  self.vect_train,  self.target)
    
    def Count(self, name):
        vectorizer = CountVectorizer(analyzer = clean_text)
        vect_fit = vectorizer.fit(self.train[name])
        vect_trans_train = vect_fit.transform(self.train[name])
        self.vect_train = vect_trans_train.toarray()
        
        TrainAndPredict(self.train, self.vect_train,  self.target)
    
        
        
        
        
        
        

# creating read data class

- > with this class we can read the available data with its name as argument

- > The split function will split the data for traing and testing and return the elements to vectorize class.

In [261]:
class ReadData(Vectorize):
    
    def __init__(self):
        
        print("ReadData instance created")
        
    
    def Type_csv(self, name, target):
        df = pd.read_csv(name)
        self.train = df
        self.target = target            
        Vectorize(self.train, self.target)
        

        
        
        
        
        

In [262]:
data = ReadData()

ReadData instance created


In [263]:
data.Type_csv("airline_sentiment_analysis.csv", "airline_sentiment")

In [264]:
data.Tfidf("text")

In [265]:
data.best_model()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,model_name
0,12.475984,0.907011,0.193291,0.032741,{'alpha': 1.0},0.883979,0.011872,Ridge_classifier
1,12.202497,0.307067,0.146112,0.00587,{'alpha': 1.5},0.881813,0.011493,Ridge_classifier
2,12.857742,0.468528,0.151646,0.008098,{'alpha': 2.0},0.879473,0.011259,Ridge_classifier
3,93.227684,6.519502,0.526797,0.077366,"{'max_depth': None, 'n_estimators': 150}",0.834585,0.100618,Randomforestclassifier
2,6.100982,0.457534,0.130142,0.005027,"{'max_depth': None, 'n_estimators': 10}",0.831814,0.082912,Randomforestclassifier
0,2.993115,0.37746,0.13657,0.027774,"{'max_depth': 30, 'n_estimators': 10}",0.829739,0.01741,Randomforestclassifier
1,32.296767,2.601199,0.259233,0.034627,"{'max_depth': 30, 'n_estimators': 150}",0.815442,0.0147,Randomforestclassifier


In [266]:
data.Count("text")

In [267]:
data.best_model()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,model_name
2,14.028101,0.728224,0.213428,0.032023,{'alpha': 2.0},0.871761,0.020845,Ridge_classifier
1,14.897444,0.537882,0.230222,0.024681,{'alpha': 1.5},0.869508,0.022938,Ridge_classifier
0,14.846,0.664404,0.260569,0.023433,{'alpha': 1.0},0.866562,0.02404,Ridge_classifier
3,73.55045,7.261005,0.498902,0.073047,"{'max_depth': None, 'n_estimators': 150}",0.840391,0.081554,Randomforestclassifier
2,4.884877,0.204913,0.130661,0.013997,"{'max_depth': None, 'n_estimators': 10}",0.830426,0.075567,Randomforestclassifier
0,2.32601,0.189012,0.115662,0.007751,"{'max_depth': 30, 'n_estimators': 10}",0.820814,0.01161,Randomforestclassifier
1,27.185447,0.962311,0.218012,0.00295,"{'max_depth': 30, 'n_estimators': 150}",0.812929,0.014924,Randomforestclassifier
