#Header

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk 
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
from google.colab import auth 
auth.authenticate_user() 
 
import gspread 
from google.auth import default 
creds, _ = default() 
 
gc = gspread.authorize(creds)

#Metrics

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def permanceMetrics(Y, pred, pred_prob):
    """Score one set of predictions with five classification metrics.

    Args:
        Y: True labels.
        pred: Predicted labels, aligned with ``Y``.
        pred_prob: Predicted scores forwarded to ``roc_auc_score``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``. Precision,
        recall and F1 use ``average='weighted'``; ROC AUC is requested with
        ``multi_class='ovr'``.
    """
    weighted = {'average': 'weighted'}
    accuracy = accuracy_score(Y, pred)
    precision = precision_score(Y, pred, **weighted)
    recall = recall_score(Y, pred, **weighted)
    f1_weighted = f1_score(Y, pred, **weighted)
    auc_ovr = roc_auc_score(Y, pred_prob, multi_class='ovr')
    return accuracy, precision, recall, f1_weighted, auc_ovr

def avgMetric(met):
    """Average per-fold metrics stored as one flat sequence.

    Args:
        met: Flat sequence laid out as
            ``[acc, pre, re, f1, auc, acc, pre, re, f1, auc, ...]``,
            i.e. five consecutive values per fold.

    Returns:
        ``np.ndarray`` of length 5 holding the mean of each metric, in the
        same order as within a fold.
    """
    flat = np.array(met)
    # Metric number `offset` of every fold sits at offset, offset+5, offset+10, ...
    return np.array([flat[offset::5].mean() for offset in range(5)])

#Preprocess

In [None]:
import re
def step03(data):
    """Strip code snippets and HTML markup from each question.

    For every question, everything between ``<code ...>`` and the next
    ``</code>`` is removed first (markup and content), then every remaining
    ``<...>`` tag is removed while its surrounding text is kept.

    Args:
        data: Iterable of question strings containing HTML.

    Returns:
        List of cleaned strings, one per input question, in input order.
    """
    # Non-greedy + DOTALL: a snippet is dropped even when it spans several
    # lines or contains a literal '>' / nested tag. The previous pattern
    # ('<code>[^>]*</code>') could not cross a '>' and so left such snippets
    # in the text. The optional attribute group also covers '<code class=...>'.
    code_pattern = re.compile(r'<code(?:\s[^>]*)?>.*?</code>', re.DOTALL)
    tag_pattern = re.compile(r'<[^>]*>')

    step03_data = []
    for question in data:
        codeless_question = code_pattern.sub('', question)
        step03_data.append(tag_pattern.sub('', codeless_question))
    return step03_data



In [None]:
def remove_punctuation(data):
    """Remove HTML (via ``step03``) and then punctuation from each question.

    Args:
        data: Iterable of raw question strings.

    Returns:
        List of strings with every character of the punctuation set below
        deleted; all other characters (including whitespace) are kept.
    """
    step03_data = step03(data)
    # Raw string: the set intentionally contains a backslash. Written as '\,'
    # in a normal literal it is an invalid escape sequence and triggers a
    # SyntaxWarning/DeprecationWarning; the characters themselves are unchanged.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # One translation table deletes all punctuation in a single pass instead
    # of rebuilding the string character by character.
    strip_table = str.maketrans('', '', punctuations)
    return [question.translate(strip_table) for question in step03_data]

# print(punctuationless_data[2])

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer


def step01(data):
    """Tokenize and stem every question.

    The input is first passed through ``remove_punctuation``; each cleaned
    question is split with ``word_tokenize`` and every token is reduced with
    the English Snowball stemmer.

    Args:
        data: Iterable of raw question strings.

    Returns:
        List (one entry per question) of lists of stemmed tokens.
    """
    stemmer = SnowballStemmer(language='english')
    cleaned_questions = remove_punctuation(data)
    return [
        [stemmer.stem(token) for token in word_tokenize(text)]
        for text in cleaned_questions
    ]

# print(step01_data[2])

In [None]:
# Load the project-specific stop-word list (one entry per line) from Drive.
# The context manager closes the file handle as soon as the list is read;
# previously the handle stayed open for the lifetime of the kernel.
with open('/content/drive/MyDrive/so papers/papers on stack overflow/User Expertise and Question Difficulty/Implementation/Stopwords.txt','r') as f:
    stop_words = f.read().split("\n")

In [None]:
def preprocessing(data):
    """Run the full text pipeline and drop stop words.

    Questions are cleaned, tokenized and stemmed by ``step01``; tokens that
    appear in either the custom ``stop_words`` list or NLTK's English stop
    words are then removed.

    Args:
        data: Iterable of raw question strings.

    Returns:
        List (one entry per question) of lists of remaining tokens, in their
        original order.
    """
    stop_words2 = nltk.corpus.stopwords.words('english')
    # A set gives constant-time membership tests; the combined list was being
    # scanned linearly for every token of every question.
    final_stopword_set = set(stop_words).union(stop_words2)

    # NOTE(review): tokens are already stemmed here while the NLTK list holds
    # unstemmed words, so stop words whose stem differs from the word itself
    # are presumably not filtered - confirm whether that is intended.
    step01_data = step01(data)
    return [
        [word for word in question if word not in final_stopword_set]
        for question in step01_data
    ]

# print(step02_data[2])

#Tf-Idf


In [None]:
from collections import Counter
import math

def tf_idf(preprocessed, min_df=3):
    """Compute TF-IDF weights for a tokenized corpus.

    * TF of a term is its count in the document divided by the document's
      total token count.
    * DF of a term is the number of documents that contain it. Terms with
      ``DF < min_df`` are dropped from the vocabulary.
    * IDF is ``log(N / DF)`` with ``N`` the number of documents.
    * Terms outside the kept vocabulary get a TF-IDF weight of 0.

    Args:
        preprocessed: Sequence of documents, each a list of tokens.
        min_df: Minimum number of documents a term must appear in to be kept
            (default 3, the previously hard-coded threshold).

    Returns:
        Tuple ``(tfidf_list, df)``: ``tfidf_list`` holds, per document, a
        list of ``(term, tfidf)`` pairs for every distinct term of that
        document in first-seen order; ``df`` maps each kept term to its
        document frequency.
    """
    # Raw counts per document as (term, count) pairs, first-seen order.
    term_count = [list(Counter(doc).items()) for doc in preprocessed]

    # Term frequency, normalised by document length. An empty document yields
    # an empty list, so the division is never evaluated with a zero total.
    tf = []
    for doc in term_count:
        total_word = sum(count for _, count in doc)
        tf.append([(term, count / float(total_word)) for term, count in doc])

    # Document frequency: every document contributes at most 1 per term.
    # Summing the in-document counts instead (as was done before) produces a
    # collection frequency, which makes log(N / df) negative for frequent
    # terms and lets a term pass the threshold from a single document.
    df = dict()
    for doc in term_count:
        for term, _ in doc:
            df[term] = df.get(term, 0) + 1
    df = {term: n_docs for term, n_docs in df.items() if n_docs >= min_df}

    # Inverse document frequency over the kept vocabulary.
    n_documents = len(term_count)
    idf = {term: math.log(n_documents / float(n_docs)) for term, n_docs in df.items()}

    # TF-IDF; terms filtered out of the vocabulary are kept with weight 0.
    tfidf_list = []
    for doc in tf:
        tfidf_list.append([(term, weight * idf.get(term, 0)) for term, weight in doc])
    return tfidf_list, df

In [None]:
def vectorizer_tfidf(processed_data):
    """Build a dense document-term TF-IDF matrix.

    Args:
        processed_data: Sequence of tokenized documents, passed to ``tf_idf``.

    Returns:
        ``pd.DataFrame`` with one row per document and one column per term
        kept by ``tf_idf`` (the keys of its ``df`` dict, in dict order). A
        cell holds the term's TF-IDF weight in that document, or 0 when the
        term does not occur there.
    """
    tfidf_list, df = tf_idf(processed_data)
    features = list(df.keys())
    tfidf_vector = []
    for doc in tfidf_list:
        # Terms are unique within a document, so a dict lookup matches the
        # former linear scan. Defaulting to 0 also covers empty documents,
        # which used to reuse the value left over from the previous lookup
        # (or raise NameError when the first document was empty).
        weights = dict(doc)
        tfidf_vector.append([weights.get(word, 0) for word in features])
    tfidf_vector_df = pd.DataFrame(tfidf_vector, columns=features)
    return tfidf_vector_df

#Data Read

In [None]:
import pandas as pd
# Open the spreadsheet once and read both worksheets from the same handle.
spreadsheet = gc.open('Final Dataset')
worksheet1 = spreadsheet.get_worksheet(0)
worksheet2 = spreadsheet.get_worksheet(1)

rows1 = worksheet1.get_all_values()
rows2 = worksheet2.get_all_values()

# The first row of each sheet supplies the column names; it is then dropped
# from the data rows.
stackData1 = pd.DataFrame.from_records(rows1, columns=rows1[0]).drop(0, axis=0)
stackData2 = pd.DataFrame.from_records(rows2, columns=rows2[0]).drop(0, axis=0)

stackData = pd.concat([stackData1, stackData2], ignore_index=True)

# Text used for TF-IDF: body + tags + title. Joined with spaces so the last
# word of one field is not glued to the first word of the next.
stackData['ProcessedBody'] = (
    stackData['ProcessedBody'] + ' ' + stackData['Tags'] + ' ' + stackData['Title']
)

feature = [
    "ProcessedBody", "LOC", "QuestionLength", "Url+ImageCount", "Reputation",
    "user_badge_bronze_counts", "user_badge_gold_counts", "user_badge_silver_counts",
    "accept_rate", "view_count", "answer_count", "favorite_count", "question_score",
    "up_vote_count", "First_answer_Interval", "Accept_answer_Interval",
]

# --------- counts: convert these columns to numeric dtype ---------
for column in ["view_count", "answer_count", "favorite_count", "question_score", "up_vote_count"]:
    stackData[column] = pd.to_numeric(stackData[column])

# --------- dates: parsed as epoch seconds ---------
for column in ["creation_date", "First_answer_date", "Accepted_answer_date"]:
    stackData[column] = pd.to_datetime(stackData[column], unit='s')

# --------- intervals: minutes from question creation to the answer ---------
stackData["First_answer_Interval"] = (
    stackData["First_answer_date"] - stackData["creation_date"]
) / pd.Timedelta(minutes=1)
stackData["Accept_answer_Interval"] = (
    stackData["Accepted_answer_date"] - stackData["creation_date"]
) / pd.Timedelta(minutes=1)

# Non-positive intervals are replaced by the sentinel -1.
# NOTE(review): presumably these rows have no (accepted) answer - confirm
# against how missing dates are encoded in the sheet.
for column in ["First_answer_Interval", "Accept_answer_Interval"]:
    stackData[column] = stackData[column].apply(lambda x: -1 if x <= 0 else x)

data = stackData[feature].values
y = stackData.Label.values
print(len(data))

1245


#Evaluation

In [None]:
# Text features: column 0 of `data` is the combined question text.
processed_data = preprocessing(data[:, 0])
tfidf_vector_df = vectorizer_tfidf(processed_data)
print(tfidf_vector_df.shape)

# Column ranges of `feature` / `data` used by each experimental setting.
# The same slice selects both the values and their column names, so the
# two can no longer get out of sync when switching settings.
FEATURE_SETTINGS = {
    "cold_start": slice(1, 4),
    "pre_hoc": slice(1, 9),
    "post_hoc": slice(1, None),
}
FEATURE_SETTING = "post_hoc"  # one of the FEATURE_SETTINGS keys

selected_columns = FEATURE_SETTINGS[FEATURE_SETTING]
other_features = data[:, selected_columns]
other_features_df = pd.DataFrame(
    [list(row) for row in other_features],
    columns=feature[selected_columns],
)
print(other_features_df.shape)

# Final design matrix: TF-IDF columns followed by the non-text features.
all_features_df = pd.concat([tfidf_vector_df, other_features_df], axis=1)
print(all_features_df.shape)

#Random Forest

In [None]:
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning) 
# warnings.filterwarnings("ignore", category=UserWarning) 
# from sklearn import model_selection
# from sklearn.metrics import classification_report
# from sklearn.model_selection import StratifiedKFold
# from sklearn.ensemble import RandomForestClassifier as RF
# from sklearn.metrics import confusion_matrix

# skf = StratifiedKFold(n_splits=10)

# i=0

# all_features_df=np.array(all_features_df)


# rm = []
# print('Metrics :  Accuracy \t\tPrecision \t\tRecall \t\tF1-score \tAUC_ROC')
# for train_index, test_index in skf.split(all_features_df,y):
#     i=i+1
#     X_train, X_test = all_features_df[train_index], all_features_df[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     rf = RF(n_estimators = 15, max_depth = 8, criterion='entropy', random_state = 42)
#     rf.fit(X_train,y_train)
#     pred = rf.predict(X_test)
#     pred_prob = rf.predict_proba(X_test)
#     acc, pre, re, f1, arc = permanceMetrics(y_test, pred, pred_prob)
#     print('Fold-',i,': ', acc, pre, re, f1,arc)
#     rm += [acc, pre, re, f1,arc]
#     cm = confusion_matrix(y_test, pred)
#     print(cm)
# acc, pre, re, f1, acrc = avgMetric(rm)
# print('\nAverage: ', acc, pre, re, f1, acrc)

Metrics :  Accuracy 		Precision 		Recall 		F1-score 	AUC_ROC
Fold- 1 :  0.608 0.5260495867768595 0.608 0.47846837606837606 0.6487961327662531
[[ 0  8  2]
 [ 0 74  0]
 [ 0 39  2]]
Fold- 2 :  0.616 0.4968242424242424 0.616 0.5218757763975156 0.6949570416188219
[[ 0  2  8]
 [ 0 72  2]
 [ 0 36  5]]
Fold- 3 :  0.624 0.5120109890109891 0.624 0.5254289127837516 0.7337243253037652
[[ 0  3  7]
 [ 0 73  1]
 [ 0 36  5]]
Fold- 4 :  0.608 0.5998868686868688 0.608 0.5587948852688737 0.7539536413000326
[[ 1  3  6]
 [ 0 65  9]
 [ 0 31 10]]
Fold- 5 :  0.656 0.5927070707070706 0.656 0.6204444444444445 0.766987382823296
[[ 0  5  6]
 [ 0 63 11]
 [ 1 20 19]]
Fold- 6 :  0.6451612903225806 0.5778125396355641 0.6451612903225806 0.5619799555283427 0.8227283085047868
[[ 0  5  5]
 [ 0 72  1]
 [ 1 32  8]]
Fold- 7 :  0.6854838709677419 0.6714905870480565 0.6854838709677419 0.6330966566238572 0.8868149021332129
[[ 1  5  4]
 [ 1 71  1]
 [ 1 27 13]]
Fold- 8 :  0.6451612903225806 0.5782925054571914 0.6451612903225806 

#XG Boost

In [None]:
# from xgboost import XGBClassifier
# from sklearn import model_selection
# from sklearn.metrics import classification_report
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import confusion_matrix

# # import scikitplot as skplt
# import matplotlib.pyplot as plt
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning) 
# warnings.filterwarnings("ignore", category=UserWarning) 

# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# i=0

# all_features_df=np.array(all_features_df)


# rm = []
# print('Metrics :  Accuracy \t\tPrecision \t\tRecall \t\tF1-score \tAUC_ROC')
# for train_index, test_index in kfold.split(all_features_df,y):
#     i=i+1
#     X_train, X_test = all_features_df[train_index], all_features_df[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     my_model = XGBClassifier(n_estimators=40,learning_rate=0.05, max_depth=8)
#     my_model.fit(X_train, y_train)
#     pred = my_model.predict(X_test)
#     pred_prob = my_model.predict_proba(X_test)


#     acc, pre, re, f1, arc = permanceMetrics(y_test, pred, pred_prob)
#     print('Fold-',i,': ', acc, pre, re, f1,arc)
#     rm += [acc, pre, re, f1,arc]
#     cm = confusion_matrix(y_test, pred)
#     print(cm)
# acc, pre, re, f1, acrc = avgMetric(rm)
# print('\nAverage: ', acc, pre, re, f1, acrc)

Metrics :  Accuracy 		Precision 		Recall 		F1-score 	AUC_ROC
Fold- 1 :  0.672 0.6430914927768859 0.672 0.6476796775436412 0.7454489514510723
[[ 1  3  6]
 [ 1 67  6]
 [ 6 19 16]]
Fold- 2 :  0.664 0.6377904761904762 0.664 0.6371123711968784 0.7785891540417147
[[ 2  3  5]
 [ 0 65  9]
 [ 2 23 16]]
Fold- 3 :  0.688 0.6969290322580645 0.688 0.6609046695341712 0.7791702386150643
[[ 2  4  4]
 [ 0 66  8]
 [ 0 23 18]]
Fold- 4 :  0.616 0.5575241502683363 0.616 0.5842 0.7202983347285011
[[ 0  4  6]
 [ 0 59 15]
 [ 0 23 18]]
Fold- 5 :  0.656 0.6411594202898552 0.656 0.6279104991394148 0.7252344252499051
[[ 2  4  5]
 [ 1 64  9]
 [ 0 24 16]]
Fold- 6 :  0.6451612903225806 0.6196998447472831 0.6451612903225806 0.6266526326244579 0.7283859869408221
[[ 2  3  5]
 [ 1 61 11]
 [ 3 21 17]]
Fold- 7 :  0.6774193548387096 0.6380408586029566 0.6774193548387096 0.641662638436832 0.7859111346955533
[[ 1  1  8]
 [ 1 67  5]
 [ 1 24 16]]
Fold- 8 :  0.6854838709677419 0.6702812980030722 0.6854838709677419 0.66611861741

#ADA Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

# import scikitplot as skplt
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

# 10-fold stratified cross-validation of AdaBoost on the combined features.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Positional row indexing below needs a plain ndarray.
all_features_df = np.array(all_features_df)

# NOTE(review): the TF-IDF vocabulary and IDF weights were computed on the
# full corpus before this split, so test folds influence the features -
# confirm whether that is acceptable for the reported scores.
rm = []
print('Metrics :  Accuracy \t\tPrecision \t\tRecall \t\tF1-score \tAUC_ROC')
for i, (train_index, test_index) in enumerate(kfold.split(all_features_df, y), start=1):
    X_train, X_test = all_features_df[train_index], all_features_df[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fixed seed so repeated runs give the same fold scores.
    my_model = AdaBoostClassifier(n_estimators=1000, learning_rate=0.05, random_state=42)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    pred_prob = my_model.predict_proba(X_test)

    # Recall is stored in `rec`, not `re`: a global named `re` would replace
    # the regex module that step03 relies on and break any later re-run of
    # the preprocessing cells.
    acc, pre, rec, f1, arc = permanceMetrics(y_test, pred, pred_prob)
    print('Fold-',i,': ', acc, pre, rec, f1,arc)
    rm += [acc, pre, rec, f1,arc]
    cm = confusion_matrix(y_test, pred)
    print(cm)
acc, pre, rec, f1, acrc = avgMetric(rm)
print('\nAverage: ', acc, pre, rec, f1, acrc)

#SVM

In [None]:
from sklearn import svm
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

# import scikitplot as skplt
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

# 10-fold stratified cross-validation of an SVM on the combined features.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Positional row indexing below needs a plain ndarray.
all_features_df = np.array(all_features_df)

# NOTE(review): no feature scaling is applied before the SVM although the
# matrix mixes TF-IDF weights with raw counts - consider scaling inside each
# fold; left unchanged here to keep the experiment as designed.
rm = []
print('Metrics :  Accuracy \t\tPrecision \t\tRecall \t\tF1-score \tAUC_ROC')
for i, (train_index, test_index) in enumerate(kfold.split(all_features_df, y), start=1):
    X_train, X_test = all_features_df[train_index], all_features_df[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # probability=True fits the probability model with internal randomised
    # cross-validation; the fixed seed makes predict_proba reproducible.
    my_model = svm.SVC(decision_function_shape='ovr', probability=True, random_state=42)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    pred_prob = my_model.predict_proba(X_test)

    # Recall is stored in `rec`, not `re`: a global named `re` would replace
    # the regex module that step03 relies on and break any later re-run of
    # the preprocessing cells.
    acc, pre, rec, f1, arc = permanceMetrics(y_test, pred, pred_prob)
    print('Fold-',i,': ', acc, pre, rec, f1,arc)
    rm += [acc, pre, rec, f1,arc]
    cm = confusion_matrix(y_test, pred)
    print(cm)
acc, pre, rec, f1, acrc = avgMetric(rm)
print('\nAverage: ', acc, pre, rec, f1, acrc)