# Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
import joblib
from sklearn.pipeline import Pipeline 
import string 
import re
import nltk
import time

# Imporing the models

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVR

# Importing Preprocessing mathods

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix as score 

In [29]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# creating the trainpredict class

- > The class contains the models which  can train with data and also predict the test data and give the metrics , all at once.


In [30]:
class TrainAndPredict:
    
    def __init__(self, train, test, vect_train, vect_test, target):
        self.train = train
        self.test = test
        self.vect_train = vect_train
        self.vect_test = vect_test
        self.target = target
        
    def RandomForestClassifier(self):
        
        rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

        start = time.time()
        rf_model = rf.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        end = time.time()
        fit_time = (end - start)
        
        start = time.time()
        y_pred = rf_model.predict(pd.DataFrame(self.vect_test))
        end = time.time()
        pred_time = (end - start)

        precision, recall= score(self.test[self.target], y_pred)
        print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
            fit_time, pred_time, precision[0]/ (precision[0] + precision[1]), recall[1]/ (recall[1] + recall[0]), (y_pred==self.test[self.target]).sum()/len(y_pred)))
    
    def GradientBoostingClassifier(self, n_estimators = 150 , max_depth = 11):
        
        gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

        start = time.time()
        gb_model = gb.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        end = time.time()
        fit_time = (end - start)
        
        start = time.time()
        y_pred = gb_model.predict(pd.DataFrame(self.vect_test))
        end = time.time()
        pred_time = (end - start)

        precision, recall= score(self.test[self.target], y_pred)
        print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
            fit_time, pred_time, precision[0]/ (precision[0] + precision[1]), recall[1]/ (recall[1] + recall[0]), (y_pred==self.test[self.target]).sum()/len(y_pred)))
    
    def RidgeClassifier(self):
        
        rc = RidgeClassifier()
        
        start = time.time()
        rc_model = rc.fit(pd.DataFrame(self.vect_train), self.train[self.target])
        end = time.time()
        fit_time = (end - start)
        
        start = time.time()
        y_pred = rc_model.predict(pd.DataFrame(self.vect_test))
        end = time.time()
        pred_time = (end - start)

        precision, recall= score(self.test[self.target], y_pred)
        print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
            fit_time, pred_time, precision[0]/ (precision[0] + precision[1]), recall[1]/ (recall[1] + recall[0]), (y_pred==self.test[self.target]).sum()/len(y_pred)))
    
            
        
        

        

# creating Vecrtorize class

- > The class contains vectorizers like tfidf and count which convert the text into columns which are used for training.

- > the methods in the class use the main text in the data and convert into columns(bag of words) and return it to train_predict class

In [31]:
class Vectorize(TrainAndPredict):
    
    def __init__(self, train, test, target):
        self.train = train
        self.test = test
        self.target = target
        self.vect_train = None
        self.vect_test = None

           
    def Tfidf(self, name):
        vectorizer = TfidfVectorizer(analyzer = clean_text)
        vect_fit = vectorizer.fit(self.train[name])
        vect_trans_train = vect_fit.transform(self.train[name])
        self.vect_train = vect_trans_train.toarray()
        vect_trans_test = vect_fit.transform(self.test[name])
        self.vect_test = vect_trans_test.toarray()
        
        TrainAndPredict(self.train, self.test, self.vect_train, self.vect_test, self.target)
    
    def Count(self, name):
        vectorizer = CountVectorizer(analyzer = clean_text)
        vect_fit = vectorizer.fit(self.train[name])
        vect_trans_train = vect_fit.transform(self.train[name])
        self.vect_train = vect_trans_train.toarray()
        vect_trans_test = vect_fit.transform(self.test[name])
        self.vect_test = vect_trans_test.toarray()
        
        TrainAndPredict(self.train, self.test, self.vect_train, self.vect_test, self.target)
    
        
        
        
        
        
        

# creating read data class

- > with this class we can read the available data with its name as argument

- > The split function will split the data for traing and testing

In [32]:
class ReadData(Vectorize):
    
    def __init__(self):
        print("ReadData instance created")
        self.df  = None
        self.train = None
        self.test = None
        self.target = None
    
    def Type_csv(self, name, target):
        df = pd.read_csv(name)
        self.df = df
        self.target = target
    
    def split(self):
        self.train = self.df.iloc[:round(0.7 * len(self.df)),:]
        self.test = self.df.iloc[round(0.7 * len(self.df)):,:]
        
        Vectorize(self.train, self.test, self.target)
        
        
        
        
        
        

In [33]:
data = ReadData()

ReadData instance created


In [34]:
data.Type_csv("airline_sentiment_analysis.csv", "airline_sentiment")

In [35]:
data.split()

In [36]:
data.Tfidf("text")

In [23]:
data.RandomForestClassifier()

Fit time: 31.732563495635986 / Predict time: 0.5719819068908691 ---- Precision: 0.9787735849056604 / Recall: 0.597165991902834 / Accuracy: 0.9243212016175621


In [37]:
data.RidgeClassifier()

Fit time: 9.735397577285767 / Predict time: 0.1444110870361328 ---- Precision: 0.9838274932614556 / Recall: 0.6356275303643725 / Accuracy: 0.9341421143847487
