# Linear SVM Model for model analysis

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score

In [3]:
%%time
## sample_500k is a sample from main dataset 
preprocess_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Stack overflow Tag /preprocessed_3title_100k.csv")
preprocess_data.head()

CPU times: user 1.02 s, sys: 243 ms, total: 1.26 s
Wall time: 3.53 s


In [4]:
preprocess_data.head()

Unnamed: 0.1,Unnamed: 0,question,code,tags,word_count_before,word_count_after,is_code
0,0,gvim window path issu gvim window path issu gv...,"['\\...\\gvim\\vim73\n', '\\...\\gvim\n', 'vim...",windows-7 windows-xp vim gvim,749,341,1
1,1,pass quotient via quotient map preserv topolog...,[],general-topology,701,446,0
2,2,jqueri cycl function paramet jqueri cycl funct...,"['$container.cycle(i, manualEffects[i]);', '$(...",jquery jquery-cycle,1083,211,1
3,3,updat problem listbox form fed queri updat pro...,[],ms-access,368,223,0
4,4,synergi conf listen synergi conf listen synerg...,['section: screens\n Roel-Desktop:\n ...,synergy,937,398,1


In [5]:
def text_splitter(text):
    return text.split()

In [6]:
# binary='true' will give a binary vectorizer
tag_vectorizer = CountVectorizer(tokenizer = text_splitter, binary=True)
multi_label_y = tag_vectorizer.fit_transform(preprocess_data['tags'].values.astype(str))

In [7]:
# make sum column wise
tag_column_sum = multi_label_y.sum(axis=0).tolist()[0]

In [8]:
# To select n number of top tags
def select_top_tags(n):
    
    # To get sotred list (means: tags appear in maximum number of questions come first)
    # top 10: [3711, 15246, 22934, 15324, 1054, 15713, 3720, 24481, 14905, 1897]
    sorted_tags = sorted(range(len(tag_column_sum)), key=lambda i: tag_column_sum[i], reverse=True)
    
    # With this line of code we get tags in our columns which are come in most of the questions
    # we will get shape: (999999, n)
    multi_label_n_y = multi_label_y[:,sorted_tags[:n]]
    return multi_label_n_y
# 
def questions_covered_fn(n):
    multi_label_n_y = select_top_tags(n)
    
    # This line will give us row wise sum of each row  [[1, 2],           [[3],
    #                                                   [4, 3]]     to     [7]]
    row_sum_array = multi_label_n_y.sum(axis=1)

    # Counts the number of non-zero values in the array
    return (np.count_nonzero(row_sum_array==0))
# With this code we checking how much percent questions are explained by how many tags
# Here we are starting from 500 because we think top 500 are most important tags we can't skip them
questions_covered=[]
total_tags=multi_label_y.shape[1]
total_qs=preprocess_data.shape[0]
for i in range(500, total_tags, 100):
    questions_covered.append(np.round(((total_qs-questions_covered_fn(i))/total_qs)*100,3))

In [9]:
multi_label_n_y = select_top_tags(500)
print("number of questions that are not covered :", questions_covered_fn(5500),"out of ", total_qs)

number of questions that are not covered : 896 out of  100000


In [10]:
print("Number of tags in sample :", multi_label_y.shape[1])
print("number of tags taken :", multi_label_n_y.shape[1],"-->",round((multi_label_n_y.shape[1]/multi_label_y.shape[1]),3)*100,"%")

Number of tags in sample : 18646
number of tags taken : 500 --> 2.7 %


In [11]:
total_size=preprocess_data.shape[0]
train_size=int(0.80*total_size)

x_train=preprocess_data.head(train_size)
x_test=preprocess_data.tail(total_size - train_size)

y_train = multi_label_n_y[0:train_size,:]
y_test = multi_label_n_y[train_size:total_size,:]

In [33]:
%%time
# To get new features with tfidf technique get 200000 features with upto 3-grams
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", tokenizer = text_splitter, sublinear_tf=False, ngram_range=(1,3))
# Apply this vectorizer only on question data column
x_train_multi_label = vectorizer.fit_transform(x_train['question'])
x_test_multi_label = vectorizer.transform(x_test['question'])

CPU times: user 53.6 s, sys: 2.43 s, total: 56 s
Wall time: 55.9 s


In [34]:
# Now check data shapes after featurization
print("Dimensions of train data X:",x_train_multi_label.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multi_label.shape,"Y:",y_test.shape)

Dimensions of train data X: (80000, 91388) Y : (80000, 500)
Dimensions of test data X: (20000, 91388) Y: (20000, 500)


In [35]:
from joblib import dump
dump(vectorizer, '/content/drive/MyDrive/Colab Notebooks/Stack overflow Tag /stackoverflow_tfidf_vectorizer_liner_svm__4grams_100k.pkl')

['/content/drive/MyDrive/Colab Notebooks/Stack overflow Tag /stackoverflow_tfidf_vectorizer_liner_svm__4grams_100k.pkl']

In [36]:
classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.00001, penalty='l1'), n_jobs=-1)

In [37]:
import time
start = time.time()
classifier.fit(x_train_multi_label, y_train)
print("Time it takes to run this :",(time.time()-start)/60,"minutes")

Time it takes to run this : 2.943255400657654 minutes


In [38]:
dump(classifier, '/content/drive/MyDrive/Colab Notebooks/Stack overflow Tag /stackoverflow_model_liner_svm_4grams_100k.pkl')

['/content/drive/MyDrive/Colab Notebooks/Stack overflow Tag /stackoverflow_model_liner_svm_4grams_100k.pkl']

In [39]:
predictions = classifier.predict(x_test_multi_label)
print("accuracy :",metrics.accuracy_score(y_test,predictions))
print("macro f1 score :",metrics.f1_score(y_test, predictions, average = 'macro'))
print("micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro'))
print("hamming loss :",metrics.hamming_loss(y_test,predictions))

accuracy : 0.2473
macro f1 score : 0.29326674885905196
micro f1 scoore : 0.47650955021565006
hamming loss : 0.0027188


In [40]:
report = metrics.classification_report(y_test, predictions, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df.to_csv("/content/report_liner_svm_100k.csv")