# Importing the required libraries

In [88]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
import joblib
from sklearn.pipeline import Pipeline 
import string 
import re
import nltk
import time

In [89]:
import joblib
import pickle 

# Importing the models

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVR

# Importing Preprocessing mathods

In [91]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix as score 

In [92]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# creating the trainpredict class

- > The class contains the models which  can train with data and also predict the test data and give the metrics , all at once.


In [93]:
class TrainAndPredict():
    
    def __init__(self, train, test, vect_train, vect_test, target):
        self.vect_train = vect_train
        self.vect_test = vect_test
        self.target = target
    
    def __predict(self):
        
        start = time.time()
        y_pred =self.model.predict(pd.DataFrame(self.vect_test))
        end = time.time()
        pred_time = (end - start)

        precision, recall= score(self.test[self.target], y_pred)
        
        print(f"""
        
Model_Summary
            Name = {self.Name}
        fit_time = {self.fit_time}
    predict_time = {pred_time}

       Precision = {precision[0]/ (precision[0] + precision[1])}
       Recall    = {recall[1]/ (recall[1] + recall[0])}
       Accuracy  = {(y_pred==self.test[self.target]).sum()/len(y_pred)}

            """)
    
    
        
    def RandomForestClassifier(self):
        
        rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)
        self.Name = "RandomForest classifier" 

        start = time.time()
        rf_model = rf.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        end = time.time()
        fit_time = (end - start)
        
        self.fit_time = fit_time
        self.model = rf_model
        
        self.__predict()
    
    def GradientBoostingClassifier(self, n_estimators = 150 , max_depth = 11):
        
        gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)
        self.Name  = "GradientBoosting Classifier"

        start = time.time()
        gb_model = gb.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        end = time.time()
        fit_time = (end - start)
        
        self.fit_time = fit_time
        self.model = gb_model
        
        self.__predict() 
        
    def RidgeClassifier(self):
        
        rc = RidgeClassifier()
        self.Name = "Ridge classifier"
        
        start = time.time()
        rc_model = rc.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        end = time.time()
        fit_time = (end - start)
        self.fit_time = fit_time
        self.model = rc_model
        
        self.__predict()
        
        
    

    def saveas_pickle(self):
        pipeline_file = open("airline_sentiment_final.pkl","wb")
        joblib.dump(self.model, pipeline_file)
        pipeline_file.close()
        print("model saved as pickle")
           
        
        

        

# creating Vectorize class

- > The class contains vectorizers like tfidf and count which convert the text into columns which are used for training.

- > the methods in the class use the main text in the data and convert into columns(bag of words) and return it to train_predict class

In [94]:
class Vectorize(TrainAndPredict):
    
    def __init__(self, train, test, target):
        self.train = train
        self.test = test
        self.target = target
        

           
    def Tfidf(self, name):
        vectorizer = TfidfVectorizer(analyzer = clean_text)
        vect_fit = vectorizer.fit(self.train[name])
        vect_trans_train = vect_fit.transform(self.train[name])
        self.vect_train = vect_trans_train.toarray()
        vect_trans_test = vect_fit.transform(self.test[name])
        self.vect_test = vect_trans_test.toarray()
        
        TrainAndPredict(self.train, self.test, self.vect_train, self.vect_test, self.target)
    
    def Count(self, name):
        vectorizer = CountVectorizer(analyzer = clean_text)
        vect_fit = vectorizer.fit(self.train[name])
        vect_trans_train = vect_fit.transform(self.train[name])
        self.vect_train = vect_trans_train.toarray()
        vect_trans_test = vect_fit.transform(self.test[name])
        self.vect_test = vect_trans_test.toarray()
        
        TrainAndPredict(self.train, self.test, self.vect_train, self.vect_test, self.target)
    
        
        
        
        
        
        

# creating read data class

- > with this class we can read the available data with its name as argument

- > The split function will split the data for traing and testing and return the elements to vectorize class.

In [95]:
class ReadData(Vectorize):
    
    def __init__(self):
        
        print("ReadData instance created")
        
    
    def Type_csv(self, name, target):
        df = pd.read_csv(name)
        self.df = df
        self.target = target
    
    def split(self):
        self.train = self.df.iloc[:round(0.7 * len(self.df)),:]
        self.test = self.df.iloc[round(0.7 * len(self.df)):,:]
        
        Vectorize(self.train, self.test, self.target)
        

        
        
        
        
        

In [96]:
data = ReadData()

ReadData instance created


In [97]:
data.Type_csv("airline_sentiment_analysis.csv", "airline_sentiment")

In [98]:
data.split()

In [99]:
data.Tfidf("text")

In [100]:
data.RidgeClassifier()


        
Model_Summary
            Name = Ridge classifier
        fit_time = 9.811229228973389
    predict_time = 0.15795183181762695

       Precision = 0.9838274932614556
       Recall    = 0.6356275303643725
       Accuracy  = 0.9341421143847487

            


In [40]:
data.RandomForestClassifier()


        
Model_Summary
            Name = RandomForest classifier
        fit_time = 30.947122812271118
    predict_time = 0.383197546005249

       Precision = 0.9858490566037735
       Recall    = 0.5566801619433198
       Accuracy  = 0.9246100519930676

            


In [15]:
data.saveas_pickle()

model saved as pickle
