# Importing the required libraries

In [57]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
import joblib
from sklearn.pipeline import Pipeline 
import string 
import re
import nltk
import time

In [58]:
import joblib
import pickle 

# Importing the models

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVR

# Importing preprocessing methods

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix as score 
from sklearn.model_selection import GridSearchCV

In [61]:
def clean_text(text):
    """Turn a raw string into a list of stemmed, stop-word-free tokens.

    This function is passed as the ``analyzer`` callable to the
    vectorizers in this file.

    Processing steps:
      1. drop every character listed in ``string.punctuation`` and
         lower-case the remaining characters;
      2. split the result on runs of non-word characters;
      3. discard tokens found in the module-level ``stopwords`` list;
      4. stem the surviving tokens with the module-level stemmer ``ps``.

    Args:
        text: the raw document as a ``str``.

    Returns:
        A list of stemmed tokens. ``re.split`` yields an empty string when
        the text starts or ends with a separator; such tokens are kept, as
        they were before.
    """
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: in a plain literal '\W' is an invalid escape sequence, which
    # Python reports with a warning and intends to turn into an error.
    tokens = re.split(r'\W+', text)
    return [ps.stem(word) for word in tokens if word not in stopwords]

# English stop-word list from NLTK; clean_text() skips any token found in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand - confirm for the target environment.
stopwords = nltk.corpus.stopwords.words('english')
# Porter stemmer instance; clean_text() uses it to stem every kept token.
ps = nltk.PorterStemmer()

# creating the trainpredict class

- > The class contains the models, which can train on the data, predict on the test data, and report the metrics, all at once.


In [62]:
class BestModel():
    """Ranks the grid-search results of several classifiers.

    The class holds no state of its own. ``best_model`` must run on an
    instance that also provides the four search methods
    ``Ridge_Classifier``, ``RandomForest_Classifier``, ``sgd_classifier``
    and ``multinomial_NB``, each returning a DataFrame with a
    ``mean_test_score`` column.
    """

    def __init__(self):
        print("finding the best model")

    def best_model(self):
        """Run every grid search and rank all candidates together.

        The searches run in the order Ridge, random forest, SGD,
        multinomial naive Bayes. They are looked up on ``self`` rather
        than on a hard-coded class, so the base class does not depend on
        its own subclass and overrides are honoured.

        Returns:
            pandas.DataFrame: the concatenated result tables, sorted by
            ``mean_test_score`` in descending order. Row labels are the
            ones each individual table carried.
        """
        searches = (
            self.Ridge_Classifier,
            self.RandomForest_Classifier,
            self.sgd_classifier,
            self.multinomial_NB,
        )
        frames = [run_search() for run_search in searches]
        return pd.concat(frames).sort_values('mean_test_score', ascending=False)
        
        

In [78]:
class TrainAndPredict(BestModel):
    """Cross-validated grid searches over a fixed set of classifiers.

    Every search fits on the same feature frame (see ``prepare_data``)
    against the column ``train[target]`` and returns a trimmed copy of
    scikit-learn's ``cv_results_`` table, tagged with a ``model_name``
    column.

    Attributes read by the methods:
        train: DataFrame holding ``body_len``, ``punct%`` and the target
            column.
        vect_train: array-like of vectorised text features, one row per
            row of ``train``.
        target: name of the target column in ``train``.
    """

    def __init__(self, train, vect_train, target):
        self.vect_train = vect_train
        self.target = target
        self.train = train

    def prepare_data(self):
        """Return ``body_len``, ``punct%`` and the vectorised columns side by side."""
        data = pd.concat([self.train['body_len'], self.train['punct%'], pd.DataFrame(self.vect_train)], axis=1)

        return data

    def _grid_search(self, estimator, param, cv, model_name):
        """Grid-search ``estimator`` and return its trimmed result table.

        Args:
            estimator: unfitted scikit-learn estimator.
            param: parameter grid, mapping parameter name to candidates.
            cv: number of cross-validation folds.
            model_name: label written to the ``model_name`` column.

        Returns:
            pandas.DataFrame sorted by ``mean_test_score`` (descending),
            without the per-split scores, the ``param_*`` columns and
            ``rank_test_score``.
        """
        features = self.prepare_data()
        # The frame mixes string labels ('body_len', 'punct%') with the integer
        # labels of the vectorised columns; recent scikit-learn releases reject
        # mixed label types, so every label is made a string on this local
        # copy (prepare_data() itself is left unchanged).
        features.columns = features.columns.astype(str)

        gs_fit = GridSearchCV(estimator, param, cv=cv).fit(features, self.train[self.target])

        results = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)
        # Scalar assignment broadcasts to every row, whatever the grid size.
        results["model_name"] = model_name

        # Derive the columns to drop from the search settings, so changing
        # `cv` or the grid does not leave stale names behind.
        dropped = ["split{}_test_score".format(fold) for fold in range(cv)]
        dropped.extend("param_{}".format(key) for key in param)
        dropped.append("rank_test_score")
        return results.drop(dropped, axis=1)

    def RandomForest_Classifier(self):
        """Grid-search a random forest over ``n_estimators`` and ``max_depth`` (5 folds)."""
        param = {'n_estimators': [10, 150],
                 'max_depth': [30, None]}
        return self._grid_search(RandomForestClassifier(), param, 5, "Randomforestclassifier")

    def Ridge_Classifier(self):
        """Grid-search a ridge classifier over ``alpha`` (4 folds).

        The fit previously referenced an undefined name (``data_p``); it
        now uses the same feature frame as the other searches.
        """
        param = {"alpha": [0.8, 0.9, 1.0]}
        return self._grid_search(RidgeClassifier(random_state=42), param, 4, "Ridge_classifier")

    def sgd_classifier(self):
        """Grid-search an SGD classifier over ``alpha`` (4 folds)."""
        param = {"alpha": [0.0001, 0.000095, 0.00008]}
        return self._grid_search(SGDClassifier(random_state=42), param, 4, "SGDClassifier")

    def multinomial_NB(self):
        """Grid-search multinomial naive Bayes over ``alpha`` (4 folds)."""
        param = {"alpha": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.2, 1.4, 1.6]}
        return self._grid_search(MultinomialNB(), param, 4, "MultinomialNB")

    def saveas_pickle(self, path="airline_sentiment_final.pkl"):
        """Serialise ``self.model`` to ``path`` with joblib.

        No method of this class assigns ``self.model``; the caller must
        set it first.

        Args:
            path: destination file; defaults to the original file name.

        Raises:
            AttributeError: if ``self.model`` has not been assigned. The
                check runs before the file is opened, so a failed call no
                longer leaves an empty file behind.
        """
        if not hasattr(self, "model"):
            raise AttributeError(
                "no model to save: assign one to `self.model` before calling saveas_pickle()"
            )
        # `with` closes the handle even when dumping fails.
        with open(path, "wb") as pipeline_file:
            joblib.dump(self.model, pipeline_file)
        print("model saved as pickle")
           
        
        

        

# creating Vectorize class

- > The class contains vectorizers (TF-IDF and count) that convert the text into columns used for training.

- > The methods in the class take the main text column of the data, convert it into columns (bag of words), and return the result to the train_predict class.

In [79]:
class Vectorize(TrainAndPredict):
    """Turns a text column of ``train`` into a dense feature matrix.

    Both public methods store the matrix on ``self.vect_train``, where
    the inherited grid-search methods read it. Tokenisation is delegated
    to ``clean_text``.
    """

    def __init__(self, train, target):
        self.train = train
        self.target = target

    def _vectorize(self, vectorizer, name):
        """Fit ``vectorizer`` on ``train[name]`` and keep the dense result."""
        # NOTE: toarray() materialises the full dense matrix; the grid-search
        # code builds a DataFrame from it.
        self.vect_train = vectorizer.fit_transform(self.train[name]).toarray()

    def Tfidf(self, name):
        """Vectorise column ``name`` with TF-IDF weights.

        The instance is updated in place; nothing is returned. The
        throwaway ``TrainAndPredict`` object that used to be built here
        was discarded immediately and has been removed.
        """
        self._vectorize(TfidfVectorizer(analyzer=clean_text), name)

    def Count(self, name):
        """Vectorise column ``name`` with raw token counts.

        The instance is updated in place; nothing is returned.
        """
        self._vectorize(CountVectorizer(analyzer=clean_text), name)
    
    
        
        
        
        

# creating read data class

- > With this class we can read the available data by passing its file name as an argument.

- > The split function will split the data for training and testing and return the elements to the vectorize class.

In [80]:
class ReadData(Vectorize):
    """Entry point of the pipeline: loads the data set from disk.

    The instance starts empty. ``Type_csv`` must be called before the
    inherited vectorising and grid-search methods, because they read
    ``self.train`` and ``self.target``.
    """

    def __init__(self):
        print("ReadData instance created")

    def Type_csv(self, name, target):
        """Load a CSV file and remember the target column.

        Args:
            name: path of the CSV file, passed to ``pandas.read_csv``.
            target: name of the column to predict.

        The frame is stored on ``self.train`` and the column name on
        ``self.target``. The ``Vectorize`` object that used to be built
        here was discarded immediately and has been removed.
        """
        self.train = pd.read_csv(name)
        self.target = target
        

        
        
        
        
        

In [81]:
# Create the pipeline object; it holds no data until Type_csv() is called.
data = ReadData()

ReadData instance created


In [82]:
# Load the CSV into data.train and record "airline_sentiment" as the target column.
data.Type_csv("data_modified.csv", "airline_sentiment")

In [83]:
# Fit a TF-IDF vectorizer on the "text" column; the dense matrix is stored on data.vect_train.
data.Tfidf("text")

In [84]:
# Grid-search the ridge classifier alone; the returned results table is displayed below.
data.Ridge_Classifier()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,model_name
1,16.870349,1.598588,0.416317,0.05858,{'alpha': 0.9},0.887878,0.010712,Ridge_classifier
2,16.428479,0.259583,0.442975,0.053576,{'alpha': 1.0},0.887532,0.011108,Ridge_classifier
0,14.928218,0.534646,0.344636,0.018583,{'alpha': 0.8},0.887098,0.010756,Ridge_classifier


In [13]:
# Run all four grid searches and rank every candidate by mean_test_score (descending).
data.best_model()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,model_name
1,75.238871,7.649687,1.116081,0.39617,{'alpha': 0.9},0.887878,0.010712,Ridge_classifier
2,32.970682,19.089618,0.537239,0.069481,{'alpha': 1.0},0.887532,0.011108,Ridge_classifier
0,59.446653,4.041303,0.9257,0.067038,{'alpha': 0.8},0.887098,0.010756,Ridge_classifier
2,4.578112,0.22842,0.208683,0.01561,"{'max_depth': None, 'n_estimators': 10}",0.842732,0.049028,Randomforestclassifier
1,33.682326,6.913221,0.231637,0.023633,{'alpha': 9.5e-05},0.8384,0.021505,SGDClassifier
3,56.504217,3.619727,0.524913,0.152613,"{'max_depth': None, 'n_estimators': 150}",0.833893,0.096404,Randomforestclassifier
0,1.45753,0.724146,0.263,0.05804,{'alpha': 0.5},0.830604,0.016622,MultinomialNB
2,40.424593,12.100794,0.216021,0.011447,{'alpha': 8e-05},0.830603,0.014611,SGDClassifier
1,1.020614,0.078886,0.26059,0.046574,{'alpha': 0.55},0.828785,0.01636,MultinomialNB
2,0.982741,0.044028,0.233931,0.009231,{'alpha': 0.6},0.828005,0.015169,MultinomialNB
