In [1]:
"""
Description : Complete model creation process as a single entity for passive aggressive classifier


"""

'\nDescription : Complete model creation process as a single entity for passive aggressive classifier\n\n\n'

In [2]:
#importing libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier


In [3]:
def pac_tfidf_model(x, y, stop=False):
    """
    Run the whole TF-IDF -> passive-aggressive pipeline on one dataset.

    Input  : x=Attributes of the dataset (handed to tfidf_vect after the
             split, so presumably an iterable of raw text documents),
             y=Target Attribute,
             stop=forwarded unchanged to passive_aggressive
    Output : None; model details are printed along the way
    """
    # split_data hands back (train_x, train_y, test_x, test_y), in that order.
    x_fit, y_fit, x_eval, y_eval = split_data(x, y)

    # Vectorize both partitions, then train and report on the result.
    vec_fit, vec_eval = tfidf_vect(x_fit, x_eval)
    passive_aggressive(vec_fit, y_fit, vec_eval, y_eval, stop)

In [4]:
def pac_hash_model(x, y, stop=False):
    """
    Run the whole hashing-vectorizer -> passive-aggressive pipeline on one dataset.

    Input  : x=Attributes of the dataset (handed to hash_vect after the
             split, so presumably an iterable of raw text documents),
             y=Target Attribute,
             stop=forwarded unchanged to passive_aggressive
    Output : None; model details are printed along the way
    """
    # split_data hands back (train_x, train_y, test_x, test_y), in that order.
    x_fit, y_fit, x_eval, y_eval = split_data(x, y)

    # Hash both partitions into feature matrices, then train and report.
    hashed_fit, hashed_eval = hash_vect(x_fit, x_eval)
    passive_aggressive(hashed_fit, y_fit, hashed_eval, y_eval, stop)

In [5]:
def split_data(x, y):
    """
    Reminder : Clean the set before sending here.

    Split the attributes and the target into an 80/20 train/test partition
    using a fixed random_state of 12 so the split is repeatable.

    No dev set is produced; a dev set is better used for the models that
    include DNN, CNN and RNN.

    Input  : x=Attributes of the dataset, y=Target Attribute
    Output : (train_x, train_y, test_x, test_y)
             train_x : Training data with attributes
             train_y : Training data with ratings
             test_x  : Testing data with attributes
             test_y  : Testing data with ratings
             NOTE: this order is NOT the order train_test_split itself
             returns (train_x, test_x, train_y, test_y).
    """
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=0.2, random_state=12
    )

    def _size(part):
        # Objects exposing .size keep reporting exactly that value; plain
        # sequences such as lists have no .size, so fall back to len()
        # instead of raising AttributeError.
        return part.size if hasattr(part, "size") else len(part)

    print("Final size of train/test :", _size(train_x), "/", _size(test_x))
    return train_x, train_y, test_x, test_y

In [6]:
def tfidf_vect(train_x, test_x):
    """
    Vectorize the train and test attributes with a single TF-IDF vectorizer.

    Input  : train_x, test_x = train and test attributes to be vectorized
    Output : (vectorized train_x, vectorized test_x)
    """
    # English stop words are removed; max_df=0.7 additionally ignores terms
    # whose document frequency exceeds 70% of the training documents.
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

    # The vectorizer is fitted on the training data only; the test data is
    # merely transformed with that already-fitted vectorizer.
    return vectorizer.fit_transform(train_x), vectorizer.transform(test_x)


In [7]:
def hash_vect(train_x, test_x):
    """
    Vectorize the train and test attributes with a single hashing vectorizer.

    Input  : train_x, test_x = train and test attributes to be vectorized
    Output : (vectorized train_x, vectorized test_x), each passed through
             np.absolute so no negative values remain
    """
    hasher = HashingVectorizer(stop_words='english')

    # Training data goes through fit_transform, test data through transform.
    train_hashed = hasher.fit_transform(train_x)
    test_hashed = hasher.transform(test_x)

    # Drop the sign of every feature value before handing the matrices back.
    return np.absolute(train_hashed), np.absolute(test_hashed)

In [8]:
def passive_aggressive(train_x, train_y, test_x, test_y, stop=False, random_state=None):
    """
    Fit a PassiveAggressiveClassifier and print its accuracy and confusion
    matrix on both the training and the testing data.

    Input  : train_x, test_x = feature matrices for training / testing
             train_y, test_y = matching target values
             stop            = passed to the classifier as early_stopping
             random_state    = passed to the classifier as random_state.
                               The default None keeps the previous unseeded
                               behaviour; pass an int to get the same
                               result on every run.
    Output : None; all results are printed
    """
    if stop:
        print("Early Stopping is being used")

    pac = PassiveAggressiveClassifier(early_stopping=stop, random_state=random_state)
    pac.fit(train_x, train_y)

    # Same report for both partitions: training data first, then test data.
    partitions = (("train", train_x, train_y), ("test", test_x, test_y))
    for label, features, target in partitions:
        predicted = pac.predict(features)
        print(label + " accuracy:", metrics.accuracy_score(target, predicted))
        print(label + " confusion matrix")
        print(metrics.confusion_matrix(target, predicted))