In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import os
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import explained_variance_score
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC
import pickle

In [2]:
dataset = pd.read_csv("finalData.csv")
dataset = dataset[dataset['civic_issue']==1]
dataset = dataset[['description','category']]
dataset.drop_duplicates(subset='description',inplace=True,keep=False)
dataset.count()

description    15234
category       15234
dtype: int64

 ## Preprocessing the Description 
 
 The preprocessing is done in 4 steps:

    - removing punctuation
    - removing stopwords like 'the', 'this','as',etc
    - conversion of the entire text to lower case
    - Stemming: reducing the number of inflectional forms of words by reducing all to their common stem.For example, 'argue','arguing','argued' are all reduced to 'argu'
    - Splitting dataset into train and cross validation sets

In [4]:
def preprocess():
    stemmer = PorterStemmer()
    words = stopwords.words("english")
    dataset['processedtext'] = dataset['description'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

preprocess()
data = dataset[['processedtext','category']]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data['processedtext'],data['category'], test_size=0.20, random_state=42)

### Defining functions to calculate model metrics

In [6]:
def auc_roc(classifier):
    pred_probs = classifier.predict_proba(test_tfidf)
    train_probs = classifier.predict_proba(train_tfidf)[:,1]
    noSkillProb = [0 for _ in range(len(y_test))]
    lr_probs = pred_probs[:,1]

    noSkillAUC = roc_auc_score(y_test,noSkillProb)
    logRegAUC = roc_auc_score(y_test,lr_probs)

    print('No Skill: ROC AUC=%.3f' % (noSkillAUC))
    print('Logistic: ROC AUC=%.3f' % (logRegAUC))

    #FPR,TPR,thresholds = roc_curve(y_test,)
    ns_fpr, ns_tpr, _ = roc_curve(y_test, noSkillProb)
    lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
    t_fpr,t_tpr,_ = roc_curve(y_train,train_probs)
    # plot the roc curve for the model
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
    plt.plot(t_fpr,t_tpr,marker='*',label='Training')
    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # show the plot
    plt.show()

In [7]:

def model_metrics(classifier,y_test,pred,score):
    print("Accuracy:", score*100, "%")
    # print("Precision:",precision_score(y_test,pred)*100,"%")
    # print("Recall:",recall_score(y_test,pred)*100,"%")
    # print("F1 Score:",f1_score(y_test,pred)*100,"%")
    # print("MSE:",mean_squared_error(y_test,pred)*100,"%")
    # print("Explained Variance Regression Score:", explained_variance_score(y_test,pred))
    # auc_roc(classifier)

## Training the Linear SVC Model and Cross Validation

In [8]:
def train_SVC(train_tfIdf, y_train):
    #building text classification model using Linear Kernel SVC Classifier (has highest accuracy)
    classifier = SVC(kernel='linear') #accuracy obtained for linear kernel = 83.28%
    classifier.fit(train_tfIdf, y_train) #fitting the classifier onto the training data
    filename = "linearkernelSVC.sav"
    pickle.dump(classifier,open(filename,"wb"))

def predict_cat():  
    # X_train: description data for training
    # y_train: corresponding categories for training
    # X_test and y_test: description and category for testing
    
    # Vectorizing the train and test data using TfIDf vectorization
    # TfIdf - Text Frequency Inverse Document Freqeuncy : vectorizes based on frequency across the current text document but less frequency across multiple documents

    vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.7) #initializing the vector
    train_tfIdf = vectorizer_tfidf.fit_transform(X_train.values.astype('U')) #astype('U') converts the dataframe into a Unicode array
    test_tfIdf = vectorizer_tfidf.transform(X_test.values.astype('U'))#transforming the text into frequency vectors
    
    # train_SVC(train_tfIdf, y_train)

    classifier = pickle.load(open("linearkernelSVC.sav","rb"))
    predictions = classifier.predict(test_tfIdf) #predictions made on the unseen data
    train_score = classifier.score(train_tfIdf, y_train)
    print("\n\nTrain Accuracy:",train_score*100,"%\n\n")
    score = classifier.score(test_tfIdf,y_test)
    model_metrics(classifier,y_test,predictions,score)

predict_cat()



Train Accuracy: 92.64790350373349 %


Accuracy: 85.16573679028554 %
