### Header

In [None]:
import warnings
# Suppress DeprecationWarning messages emitted by code that runs after this point.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import nltk 
# Fetch the NLTK 'stopwords' corpus; it is read further down through
# nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True remounts even if it
# is already mounted.
# NOTE(review): nothing in the visible cells reads from /content/drive — confirm
# the mount is still needed.
drive.mount("/content/drive", force_remount=True)

In [None]:
# Install pyLDAvis into the runtime. The version is not pinned, so the installed
# release may differ between runs.
!pip install pyldavis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy

In [None]:
# Authenticate the Colab user, then build a gspread client from the resulting
# default Google credentials. The order matters: default() is called only after
# authenticate_user() has run.
from google.colab import auth 
auth.authenticate_user() 
 
import gspread 
from google.auth import default 
# default() returns a pair; only the credentials object is kept.
creds, _ = default() 
 
# `gc` is the gspread client used later to open the 'Final Dataset' spreadsheet.
gc = gspread.authorize(creds)

In [None]:
# spaCy English pipeline with the 'parser' and 'ner' components disabled;
# process_words() uses it for lemmas and part-of-speech tags.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# NLTK's English stop-word list. This list object is the default `stop_words`
# argument of process_words() and is extended in place by frequency_counter().
stop_words = nltk.corpus.stopwords.words('english')

### Preprocessing

In [None]:

def process_words(texts, stop_words=stop_words, allowed_tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Tokenise, phrase-merge and lemmatise a collection of documents.

    Pipeline, per document:
      1. ``simple_preprocess`` (accents stripped, tokens shorter than 3
         characters dropped) followed by stop-word removal.
      2. Bigram, then trigram phrase detection with gensim ``Phrases``.
      3. Lemmatisation with the module-level spaCy pipeline ``nlp``, keeping
         only tokens whose part-of-speech tag is in ``allowed_tags``.
      4. A second ``simple_preprocess`` + stop-word pass over the lemmas.

    The phrase models are trained on the *tokenised* documents. Training them
    on the raw input (as was done before) makes gensim iterate every string
    character by character, so no word-level phrase is ever learned. Each
    phrase model is also applied exactly once; previously the bigram model was
    applied a second time inside the trigram step.

    Args:
        texts: Iterable of documents. Each item is passed through ``str()``
            before tokenisation.
        stop_words: Collection of words to discard. Defaults to the
            module-level ``stop_words`` list object that existed when this
            function was defined.
        allowed_tags: spaCy coarse POS tags (``token.pos_``) to keep.

    Returns:
        list[list[str]]: One list of processed tokens per input document.
    """
    # Build the set once per call: O(1) lookups, and it still reflects any
    # in-place growth of the shared stop-word list since the previous call.
    stop_set = set(stop_words)

    def _tokenize(doc):
        # str() lets the same helper handle raw strings and lists of lemmas.
        return [word for word in simple_preprocess(str(doc), deacc=True, min_len=3)
                if word not in stop_set]

    tokenized = [_tokenize(doc) for doc in texts]

    # Phrase models must see lists of tokens, not raw strings.
    bigram = gensim.models.Phrases(tokenized, min_count=20, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    bigrammed = [bigram_mod[doc] for doc in tokenized]

    trigram = gensim.models.Phrases(bigrammed, threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    phrased = [trigram_mod[doc] for doc in bigrammed]

    texts_out = []
    for sent in phrased:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_tags])

    # Lemmatisation can produce stop words or short tokens again, so filter once more.
    return [_tokenize(doc) for doc in texts_out]

In [None]:
def corpus_ready(data):
    """Turn raw documents into a bag-of-words corpus plus its dictionary.

    Args:
        data: Documents accepted by ``process_words``.

    Returns:
        tuple: ``(corpus, id2word)`` where ``id2word`` is a gensim
        ``Dictionary`` fitted on the processed documents and ``corpus`` is the
        list of ``doc2bow`` encodings of those same documents.
    """
    processed_docs = process_words(data)
    id2word = corpora.Dictionary(processed_docs)
    bow_corpus = []
    for tokens in processed_docs:
        bow_corpus.append(id2word.doc2bow(tokens))
    return bow_corpus, id2word

In [None]:
# Pin pandas to 1.2.
# NOTE(review): pandas has already been imported earlier in this notebook, so
# this pin presumably only takes effect after the runtime is restarted — confirm.
!pip install --upgrade pandas==1.2
from sklearn import preprocessing

def frequency_counter(data, min_freq=50):
    """Add very frequent corpus words to the shared stop-word list.

    The documents are processed with ``corpus_ready``. The total count of
    every remaining word is accumulated over the whole corpus, and each word
    whose total count is strictly greater than ``min_freq`` is appended to the
    module-level ``stop_words`` list (mutated in place).

    Compared with the previous version, the intermediate DataFrame and the
    min-max normalised frequency column were removed: neither was used, and
    the scaler raised on an empty corpus.

    Args:
        data: Documents accepted by ``corpus_ready``.
        min_freq: Words with a corpus-wide count above this value become stop
            words. Defaults to 50, the previously hard-coded threshold.

    Returns:
        None. The effect is the in-place extension of ``stop_words``.
    """
    from collections import Counter

    corpus, id2word = corpus_ready(data)

    word_freq = Counter()
    for bow in corpus:
        for idx, freq in bow:
            word_freq[id2word[idx]] += freq

    # Counter preserves first-seen order, so the appended words keep the same
    # order the original DataFrame-based selection produced.
    extension = [word for word, freq in word_freq.items() if freq > min_freq]
    stop_words.extend(extension)

### Generate LDA model

In [None]:
def new_corpus(data,test=0):
    """Build a filtered bag-of-words corpus and dictionary from documents.

    When ``test`` equals 0, ``frequency_counter(data)`` is run first, which
    extends the shared stop-word list before the documents are processed.
    In every case a new gensim ``Dictionary`` is fitted on ``data`` itself and
    pruned with ``filter_extremes(no_below=2, no_above=0.5)``.

    Args:
        data: Documents accepted by ``process_words``.
        test: ``0`` to update the stop-word list first; any other value skips
            that step.

    Returns:
        tuple: ``(corpus, id2word)`` — the ``doc2bow`` encoding of each
        processed document and the dictionary used to encode it.
    """
    update_stop_words = (test == 0)
    if update_stop_words:
        frequency_counter(data)

    processed_docs = process_words(data)
    vocabulary = corpora.Dictionary(processed_docs)
    vocabulary.filter_extremes(no_below=2, no_above=0.5)

    bow_corpus = []
    for tokens in processed_docs:
        bow_corpus.append(vocabulary.doc2bow(tokens))
    return bow_corpus, vocabulary

In [None]:
def topic_model_train(X_train,topics,random_state=None):
    """Fit an LDA model on the training documents and return their topic vectors.

    The fitted model is stored in the module-level global ``lda`` so that
    ``topic_prediction_vector`` can reuse it.

    Args:
        X_train: Training documents accepted by ``new_corpus``.
        topics: Number of LDA topics.
        random_state: Optional seed forwarded to ``LdaModel`` to make training
            reproducible. The default ``None`` keeps the previous, unseeded
            behaviour.

    Returns:
        The result of ``lda[corpus]`` for the training corpus: per-document
        lists of ``(topic_id, probability)`` pairs.
    """
    global lda
    corpus,id2word=new_corpus(X_train)
    lda = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=topics,
        id2word=id2word,
        random_state=random_state,
    )
    tm_train_results=lda[corpus]
    return tm_train_results

In [None]:
def topic_prediction_vector(X_test,topics):
    """Infer topic vectors for unseen documents with the trained global ``lda``.

    The documents are encoded with the dictionary stored on the model
    (``lda.id2word``). Previously a brand-new dictionary was fitted on the test
    documents, so token ids in the test corpus did not refer to the same words
    as the ids the model was trained on. Words missing from the training
    dictionary are dropped by ``doc2bow``.

    Args:
        X_test: Documents accepted by ``process_words``.
        topics: Unused; kept so existing callers keep working.

    Returns:
        The result of ``lda[corpus]`` for the encoded test documents.

    Raises:
        NameError: If ``topic_model_train`` has not been run yet, because the
            global ``lda`` does not exist.
    """
    data_ready = process_words(X_test)
    # Encode with the training vocabulary so ids line up with the model's terms.
    corpus = [lda.id2word.doc2bow(text) for text in data_ready]
    tm_test_results = lda[corpus]
    return tm_test_results

In [None]:
def create_TM_df(tm_results,topics):
    """Convert sparse per-document topic weights into a dense DataFrame.

    Each document in ``tm_results`` is a sequence of ``(topic_id, weight)``
    pairs that may omit topics. Omitted topics are filled with 0. The input is
    only read: the previous version appended to and sorted each document's
    list in place, which modified the caller's data when a plain list of lists
    was passed.

    Args:
        tm_results: Indexable collection with ``len()`` whose items are
            sequences of ``(topic_id, weight)`` pairs, with ``topic_id`` in
            ``range(topics)``.
        topics: Total number of topics, i.e. the number of output columns.

    Returns:
        pandas.DataFrame: One row per document and columns ``Topic-1`` ..
        ``Topic-<topics>``; column ``Topic-k`` holds the weight of topic id
        ``k - 1``.
    """
    columns = ["Topic-" + str(number) for number in range(1,topics+1)]
    dense_rows = []
    # Index access (rather than iteration) matches how callers' results were
    # consumed before.
    for i in range(len(tm_results)):
        weights = dict(tm_results[i])
        dense_rows.append([weights.get(j, 0) for j in range(topics)])
    X_TM_df=pd.DataFrame(dense_rows,columns=columns)
    return X_TM_df

### Metrics

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def permanceMetrics(Y, pred, pred_prob):
    """Compute accuracy, weighted precision/recall/F1 and one-vs-rest ROC AUC.

    Args:
        Y: True labels.
        pred: Predicted labels.
        pred_prob: Class-probability estimates, as returned by a classifier's
            ``predict_proba``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    acc = accuracy_score(Y,pred)
    pre = precision_score(Y,pred,average='weighted')
    re = recall_score(Y,pred, average='weighted')
    f1 = f1_score(Y,pred, average='weighted')

    # For a two-class problem roc_auc_score expects a 1-D score for the
    # positive class and rejects an (n, 2) matrix, so pass the second column.
    # Inputs with any other shape are forwarded unchanged.
    scores = np.asarray(pred_prob)
    if scores.ndim == 2 and scores.shape[1] == 2:
        scores = scores[:, 1]
    acrc = roc_auc_score(Y,scores,multi_class= 'ovr')
    return acc, pre, re, f1, acrc

def avgMetric(met):
    """Average a flat metric list that stores five values per fold.

    ``met`` is read with a stride of five: positions 0, 5, 10, ... form the
    first metric, positions 1, 6, 11, ... the second, and so on.

    Args:
        met: Flat sequence of numbers.

    Returns:
        numpy.ndarray: Five means, one per stride offset 0..4.
    """
    flat = np.array(met)
    means = []
    for offset in range(5):
        means.append(flat[offset::5].mean())
    return np.array(means)

Post-hoc features

In [None]:
import pandas as pd

# Open the spreadsheet once and read both worksheets through the same handle
# (previously the spreadsheet was opened twice).
spreadsheet = gc.open('Final Dataset')
worksheet1 = spreadsheet.get_worksheet(0)
worksheet2 = spreadsheet.get_worksheet(1)
rows1 = worksheet1.get_all_values()
rows2 = worksheet2.get_all_values()

# The first row of each sheet supplies the column names; it is also loaded as
# data row 0, so that row is dropped again right away.
stackData1 = pd.DataFrame.from_records(rows1, columns=rows1[0]).drop(0, axis=0)
stackData2 = pd.DataFrame.from_records(rows2, columns=rows2[0]).drop(0, axis=0)

stackData = pd.concat([stackData1, stackData2], ignore_index=True)

# Merge body, tags and title into one text field. The separating spaces keep
# the last word of one field from being glued to the first word of the next,
# which would otherwise create a bogus token.
stackData['ProcessedBody'] = (
    stackData['ProcessedBody'] + ' ' + stackData['Tags'] + ' ' + stackData['Title']
)

# Column 0 is the text; the remaining columns are the non-text features.
feature = [
    "ProcessedBody", "LOC", "QuestionLength", "Url+ImageCount", "Reputation",
    "user_badge_bronze_counts", "user_badge_gold_counts", "user_badge_silver_counts",
    "accept_rate", "view_count", "answer_count", "favorite_count",
    "question_score", "up_vote_count",
    "First_answer_Interval", "Accept_answer_Interval",
]

#---------count------------#
# NOTE(review): only these five columns are converted to numbers; the other
# feature columns are kept exactly as the sheet returned them — confirm the
# downstream models receive them in a usable form.
for count_column in ["view_count", "answer_count", "favorite_count",
                     "question_score", "up_vote_count"]:
    stackData[count_column] = pd.to_numeric(stackData[count_column])

#---------Dates------------#
# The date columns are parsed as epoch seconds.
for date_column in ["creation_date", "First_answer_date", "Accepted_answer_date"]:
    stackData[date_column] = pd.to_datetime(stackData[date_column], unit='s')

#----------Date Interval ------#
# Minutes elapsed between question creation and the first / accepted answer.
one_minute = pd.Timedelta(minutes=1)
stackData["First_answer_Interval"] = (
    stackData["First_answer_date"] - stackData["creation_date"]
) / one_minute
stackData["Accept_answer_Interval"] = (
    stackData["Accepted_answer_date"] - stackData["creation_date"]
) / one_minute

# Non-positive intervals are replaced by the sentinel -1.
stackData["First_answer_Interval"] = stackData["First_answer_Interval"].apply(lambda x: -1 if x <= 0 else x)
stackData["Accept_answer_Interval"] = stackData["Accept_answer_Interval"].apply(lambda x: -1 if x <= 0 else x)

data = stackData[feature].values
y = stackData.Label.values


# Random Forest

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier as RF

# Random Forest evaluated with 10-fold stratified cross-validation, repeated
# for every topic count in [NoOfTopics, NoOfTopics + 20).
skf = StratifiedKFold(n_splits=10)

i=0
NoOfTopics=20
rm = []
for topic in range(NoOfTopics,NoOfTopics+20):
    print("For total Number of Topics:",topic)
    print('Metrics :  Accuracy \t\tPrecision \t\tRecall \t\tF1-score \tAUC_ROC')
    i=0
    # Start a fresh metric list for each topic count so the printed average
    # covers only this topic count's folds (it used to include earlier ones).
    rm = []
    for train_index, test_index in skf.split(data,y):
        # Column 0 of `data` is the text used for topic modelling.
        X_train_body, X_test_body = data[:,0][train_index], data[:,0][test_index]
        Y_train, Y_test = y[train_index], y[test_index]

        print(X_train_body.shape, X_test_body.shape)
        print(Y_train.shape, Y_test.shape)

        # Topic features: LDA is fitted on the training fold only.
        tm_train_results=topic_model_train(X_train_body,topic)
        tm_test_results=topic_prediction_vector(X_test_body,topic)
        X_train_df=create_TM_df(tm_train_results,topic)
        X_test_df=create_TM_df(tm_test_results,topic)

        # Remaining columns of `data` are the non-text features.
        X_train_others,X_test_others=data[:,1:][train_index], data[:,1:][test_index]

        X_train_others_df=pd.DataFrame([[x for x in col] for col in X_train_others],columns=feature[1:])
        X_test_others_df=pd.DataFrame([[x for x in col] for col in X_test_others],columns=feature[1:])

        X_all_train_df=pd.concat([X_train_df,X_train_others_df], axis=1)
        X_all_test_df=pd.concat([X_test_df,X_test_others_df], axis=1)

        X_train=X_all_train_df.values
        X_test=X_all_test_df.values

        rf = RF(n_estimators = 15, max_depth = 8, criterion='entropy', random_state = 42)

        rf.fit(X_train,Y_train)
        pred = rf.predict(X_test)
        pred_prob = rf.predict_proba(X_test)

        i=i+1
        acc, pre, re, f1, acrc = permanceMetrics(Y_test, pred, pred_prob)
        print('Fold-',i,': ', acc, pre, re, f1, acrc)
        cm = confusion_matrix(Y_test, pred)
        print(cm)
        # avgMetric reads the list with a stride of 5, so all five metrics
        # (including the AUC, which was missing before) must be stored per fold.
        rm += [acc, pre, re, f1, acrc]

    acc, pre, re, f1, acrc = avgMetric(rm)
    print('\nAverage: ', acc, pre, re, f1, acrc)

# XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# XGBoost evaluated with shuffled 10-fold stratified cross-validation, using a
# fixed number of LDA topics (`size`).
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
i = 0
xg = []
size = 32

for train_index, test_index in kfold.split(data, y):
    # Column 0 of `data` is the text used for topic modelling.
    X_train_body = data[train_index, 0]
    X_test_body = data[test_index, 0]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train_body.shape, X_test_body.shape)
    print(y_train.shape, y_test.shape)

    # Topic features: the LDA model is fitted on the training fold, then
    # applied to the held-out fold.
    tm_train_results = topic_model_train(X_train_body, size)
    tm_test_results = topic_prediction_vector(X_test_body, size)
    X_train_df = create_TM_df(tm_train_results, size)
    X_test_df = create_TM_df(tm_test_results, size)

    # Remaining columns of `data` are the non-text features.
    X_train_others = data[train_index, 1:]
    X_test_others = data[test_index, 1:]
    X_train_others_df = pd.DataFrame([list(row) for row in X_train_others], columns=feature[1:])
    X_test_others_df = pd.DataFrame([list(row) for row in X_test_others], columns=feature[1:])

    # Topic columns first, other features after them.
    X_all_train_df = pd.concat([X_train_df, X_train_others_df], axis=1)
    X_all_test_df = pd.concat([X_test_df, X_test_others_df], axis=1)
    X_train = X_all_train_df.values
    X_test = X_all_test_df.values

    my_model = XGBClassifier(n_estimators=40, learning_rate=0.05, max_depth=8)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    pred_prob = my_model.predict_proba(X_test)

    i += 1
    acc, pre, re, f1, acrc = permanceMetrics(y_test, pred, pred_prob)
    print('Fold-',i,': ', acc, pre, re, f1, acrc)
    # Five values per fold, matching the stride avgMetric reads with.
    xg.extend([acc, pre, re, f1, acrc])

acc, pre, re, f1, acrc = avgMetric(xg)
print('\nAverage: ', acc, pre, re, f1, acrc)


# AdaBoost


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# AdaBoost evaluated with shuffled 10-fold stratified cross-validation, using
# a fixed number of LDA topics (`size`).
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
i = 0
xg = []  # per-fold metrics of this cell's model, five values per fold
size = 32

for train_index, test_index in kfold.split(data, y):
    # Column 0 of `data` is the text used for topic modelling.
    X_train_body = data[train_index, 0]
    X_test_body = data[test_index, 0]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train_body.shape, X_test_body.shape)
    print(y_train.shape, y_test.shape)

    # Topic features: the LDA model is fitted on the training fold, then
    # applied to the held-out fold.
    tm_train_results = topic_model_train(X_train_body, size)
    tm_test_results = topic_prediction_vector(X_test_body, size)
    X_train_df = create_TM_df(tm_train_results, size)
    X_test_df = create_TM_df(tm_test_results, size)

    # Remaining columns of `data` are the non-text features.
    X_train_others = data[train_index, 1:]
    X_test_others = data[test_index, 1:]
    X_train_others_df = pd.DataFrame([list(row) for row in X_train_others], columns=feature[1:])
    X_test_others_df = pd.DataFrame([list(row) for row in X_test_others], columns=feature[1:])

    # Topic columns first, other features after them.
    X_all_train_df = pd.concat([X_train_df, X_train_others_df], axis=1)
    X_all_test_df = pd.concat([X_test_df, X_test_others_df], axis=1)
    X_train = X_all_train_df.values
    X_test = X_all_test_df.values

    my_model = AdaBoostClassifier(n_estimators=1000, learning_rate=0.05)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    pred_prob = my_model.predict_proba(X_test)

    i += 1
    acc, pre, re, f1, acrc = permanceMetrics(y_test, pred, pred_prob)
    print('Fold-',i,': ', acc, pre, re, f1, acrc)
    xg.extend([acc, pre, re, f1, acrc])

acc, pre, re, f1, acrc = avgMetric(xg)
print('\nAverage: ', acc, pre, re, f1, acrc)

# SVM

In [None]:
from sklearn import svm
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Support-vector classifier evaluated with shuffled 10-fold stratified
# cross-validation, using a fixed number of LDA topics (`size`).
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
i = 0
xg = []  # per-fold metrics of this cell's model, five values per fold
size = 32

for train_index, test_index in kfold.split(data, y):
    # Column 0 of `data` is the text used for topic modelling.
    X_train_body = data[train_index, 0]
    X_test_body = data[test_index, 0]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train_body.shape, X_test_body.shape)
    print(y_train.shape, y_test.shape)

    # Topic features: the LDA model is fitted on the training fold, then
    # applied to the held-out fold.
    tm_train_results = topic_model_train(X_train_body, size)
    tm_test_results = topic_prediction_vector(X_test_body, size)
    X_train_df = create_TM_df(tm_train_results, size)
    X_test_df = create_TM_df(tm_test_results, size)

    # Remaining columns of `data` are the non-text features.
    X_train_others = data[train_index, 1:]
    X_test_others = data[test_index, 1:]
    X_train_others_df = pd.DataFrame([list(row) for row in X_train_others], columns=feature[1:])
    X_test_others_df = pd.DataFrame([list(row) for row in X_test_others], columns=feature[1:])

    # Topic columns first, other features after them.
    X_all_train_df = pd.concat([X_train_df, X_train_others_df], axis=1)
    X_all_test_df = pd.concat([X_test_df, X_test_others_df], axis=1)
    X_train = X_all_train_df.values
    X_test = X_all_test_df.values

    # probability=True is required for predict_proba below.
    my_model = svm.SVC(decision_function_shape='ovr', probability=True)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    pred_prob = my_model.predict_proba(X_test)

    i += 1
    acc, pre, re, f1, acrc = permanceMetrics(y_test, pred, pred_prob)
    print('Fold-',i,': ', acc, pre, re, f1, acrc)
    xg.extend([acc, pre, re, f1, acrc])

acc, pre, re, f1, acrc = avgMetric(xg)
print('\nAverage: ', acc, pre, re, f1, acrc)