In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from textblob import TextBlob
from datetime import datetime, timezone, timedelta
from enum import Enum
from sklearn.model_selection import train_test_split
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List the input directory so a missing or misnamed data file is obvious up front.
print(os.listdir("../input"))

# Only the first 200000 rows of the CSV are read (nrows).
questions_df = pd.read_csv("../input/Questions.csv", encoding = "ISO-8859-1", nrows=200000)

# Boolean mask: True for rows whose ClosedDate is present.
questions_nonnull_df = questions_df["ClosedDate"].notnull()

# .copy() detaches the filtered rows from questions_df, because feature
# columns are assigned onto the split frames further down.
filtered_df = questions_df[questions_nonnull_df].copy()
#print(questions_df.head())

# 70/30 split; the fixed random_state makes the split (and every score
# reported below) repeatable across kernel restarts.
trainingSet, testSet = train_test_split(filtered_df, test_size=0.3, random_state=42)

#Part-of-day buckets, stored in the data frame through their integer values.
class TimeOfTheDay(Enum):
    """Coarse time-of-day label with integer codes 1 (Morning) to 4 (Night)."""
    Morning, Afternoon, Evening, Night = 1, 2, 3, 4
    
#Response-time buckets, stored in the data frame through their integer values.
class ResponseTiming(Enum):
    """How long a question stayed open, with integer codes 1 (Short) to 3 (Large)."""
    Short, Medium, Large = 1, 2, 3

In [None]:


#Report whether a question body contains an opening <code> tag.
#Kept as its own function (instead of an inline "in" test) so the
#detection rule can be changed in one place later.
def check_for_code_tag(question_str):
    """Return True when ``question_str`` contains the literal ``<code>``.

    Args:
        question_str: Question body to inspect.

    Returns:
        bool: True if the substring ``<code>`` occurs anywhere in the text.
    """
    # No debug print here: this runs once per question row.
    return "<code>" in question_str

#Return a copy of the question with every <code>...</code> block removed.
def remove_code_block(question_str):
    """Strip all ``<code>...</code>`` blocks from ``question_str``.

    The text is scanned left to right. Each closing tag is searched for
    *after* its opening tag, so a stray ``</code>`` that appears before any
    ``<code>`` is left untouched instead of corrupting the slice. A
    ``<code>`` with no closing tag is treated as running to the end of the
    text, and everything from that tag onwards is dropped.

    Args:
        question_str: Question body, possibly containing code blocks.

    Returns:
        str: A new string without the code blocks. The input is not modified.
    """
    code_str = "<code>"
    code_end_str = "</code>"
    kept_parts = []
    cursor = 0
    while True:
        code_index = question_str.find(code_str, cursor)
        if code_index == -1:
            # No further code block: keep the remaining text as it is.
            kept_parts.append(question_str[cursor:])
            break
        kept_parts.append(question_str[cursor:code_index])
        code_end_index = question_str.find(code_end_str, code_index + len(code_str))
        if code_end_index == -1:
            # Unterminated block: discard from the opening tag to the end.
            break
        cursor = code_end_index + len(code_end_str)
    return "".join(kept_parts)

#Tell whether the question string contains an anchor tag with an href
def check_for_links(question_str):
    """Return True when the substring ``<a href`` occurs in ``question_str``."""
    anchor_position = question_str.find("<a href")
    return anchor_position != -1

#Remove the anchor tags from the question string, keeping the link text
def remove_links(question_str):
    """Strip ``<a ...>`` and ``</a>`` tags from ``question_str``.

    Only the tags are removed; the text between them stays in place.

    Args:
        question_str: Question body that may contain anchor tags.

    Returns:
        str: The text with opening and closing anchor tags removed.
    """
    # \b keeps other tags that merely start with "a" (<abbr>, <article>, ...)
    # intact, and [^>]* also covers a tag whose attributes span several lines.
    # The closing tag is matched without a trailing space, so the space after
    # a link is preserved and "</a>" followed by punctuation is removed too.
    return re.sub(r'<a\b[^>]*>|</a>', '', question_str)

#Sentiment of the question text, as scored by TextBlob
def sentiment_analysis(question):
    """Return the TextBlob sentiment polarity of ``question``."""
    return TextBlob(question).sentiment.polarity

#Calculate the share of upper-case characters among all characters of the question
def calculate_upper_char_ratio(question):
    """Return the fraction of characters in ``question`` that are upper-case.

    Args:
        question: Question text (may be empty, e.g. when the body consisted
            only of a code block that has been stripped).

    Returns:
        float: upper-case character count divided by the total length, or
        0.0 for an empty string (instead of raising ZeroDivisionError).
    """
    if not question:
        return 0.0
    number_of_upper_chars = sum(1 for c in question if c.isupper())
    return number_of_upper_chars / len(question)

#Thin wrapper that reports how many characters the question has
def get_length_of_the_question(question):
    """Return ``len(question)``."""
    character_count = len(question)
    return character_count

#Turn a "YYYY-MM-DDTHH:MM:SSZ" string into a timezone-aware datetime
def get_date_obj_from_string(date):
    """Parse ``date`` (format ``%Y-%m-%dT%H:%M:%SZ``) and attach a zero-offset timezone."""
    naive_timestamp = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
    return naive_timestamp.replace(tzinfo=timezone.utc)

#Tell whether the date falls on Monday..Friday
def check_for_weekday(date_obj):
    """Return True when ``date_obj.isoweekday()`` is 1 (Monday) through 5 (Friday)."""
    iso_day = date_obj.isoweekday()
    return 1 <= iso_day <= 5

#Bucket the hour of the date object into a TimeOfTheDay member
def check_for_time_of_the_day(date_obj):
    """Map ``date_obj.hour`` onto a ``TimeOfTheDay`` member.

    Hours 10-12 are Morning, 13-17 Afternoon, 18-20 Evening and every
    other hour (21-23 and 0-9) is Night.
    """
    hour_of_day = date_obj.hour
    if 10 <= hour_of_day < 13:
        return TimeOfTheDay.Morning
    if 13 <= hour_of_day < 18:
        return TimeOfTheDay.Afternoon
    if 18 <= hour_of_day < 21:
        return TimeOfTheDay.Evening
    return TimeOfTheDay.Night
    

def extract_data_from_the_frame(sets, index):
    """Compute the engineered features for one question and write them into ``sets``.

    The body of the row at position ``index`` is stripped of ``<code>`` blocks
    and links, then scored (sentiment, length, upper-case ratio). The creation
    timestamp yields the weekday / time-of-day features, and the number of
    days between creation and close is bucketed into a ``ResponseTiming``
    value.

    Args:
        sets: DataFrame holding "Body", "CreationDate" and "ClosedDate" plus
            the feature columns created by ``prepare_dataframe``. Modified
            in place.
        index: Zero-based position of the row to process.

    Returns:
        A one-row DataFrame for that position, read back after the features
        were written.
    """
    # Rows are read by position (iloc) but written by label (at). Resolving
    # the label here keeps both in step even when the index is not 0..n-1.
    row_label = sets.index[index]
    question_df = sets.iloc[[index]]
    question_body = question_df["Body"].values[0]
    question_creation_date = question_df["CreationDate"].values[0]
    question_answer_date = question_df["ClosedDate"].values[0]

    # The presence flags are taken before the markup is stripped.
    is_code_present = check_for_code_tag(question_body)
    if is_code_present:
        question_body = remove_code_block(question_body)

    is_link_present = check_for_links(question_body)
    if is_link_present:
        question_body = remove_links(question_body)

    # Text features are computed on the cleaned body.
    sentiment_polarity = sentiment_analysis(question_body)
    question_length = get_length_of_the_question(question_body)
    upper_character_ratio = calculate_upper_char_ratio(question_body)

    date_obj = get_date_obj_from_string(question_creation_date)
    is_weekday = check_for_weekday(date_obj)
    time_of_the_day = check_for_time_of_the_day(date_obj)

    close_date_obj = get_date_obj_from_string(question_answer_date)
    days = (close_date_obj - date_obj).days

    # Under 3 days is Short, under 7 days is Medium, anything else is Large.
    if days < 3:
        responseTime = ResponseTiming.Short
    elif days < 7:
        responseTime = ResponseTiming.Medium
    else:
        responseTime = ResponseTiming.Large

    sets.at[row_label, "sentiment"] = sentiment_polarity
    # Length of the cleaned body, as the column name says (previously the
    # raw body length was stored and question_length went unused).
    sets.at[row_label, "length_of_clean_question"] = question_length
    sets.at[row_label, "code_present"] = is_code_present
    sets.at[row_label, "link_present"] = is_link_present
    sets.at[row_label, "is_weekday"] = is_weekday
    sets.at[row_label, "time_of_the_day"] = time_of_the_day.value
    sets.at[row_label, "upper_char_ratio"] = upper_character_ratio
    # Despite its name this column holds the ResponseTiming code, not a day count.
    sets.at[row_label, "days_to_close_question"] = responseTime.value

    # Re-read the row so the returned frame includes the values just written.
    return sets.iloc[[index]]

def prepare_dataframe(questions_df):
    """Add the engineered-feature columns to ``questions_df``, filled with defaults.

    The frame is modified in place and nothing is returned. Each column is
    created with the dtype of the values written into it later, so the
    per-row assignments do not have to change a column's dtype.

    Args:
        questions_df: DataFrame that receives the feature columns.
    """
    # NaN (float) rather than None, which would create an object column.
    questions_df["sentiment"] = np.nan
    questions_df["length_of_clean_question"] = 0
    questions_df["code_present"] = False
    questions_df["link_present"] = False
    questions_df["is_weekday"] = False
    questions_df["time_of_the_day"] = TimeOfTheDay.Night.value
    # 0.0 rather than 0: the ratio is a float, and an integer column would
    # be the wrong dtype for it.
    questions_df["upper_char_ratio"] = 0.0
    questions_df["days_to_close_question"] = ResponseTiming.Large.value
    

In [None]:
def _add_question_features(question_set):
    """Return a re-indexed copy of ``question_set`` with every feature column filled in."""
    # reset_index() comes first: it returns a new frame with a 0..n-1 index
    # (the old index is kept as an "index" column), so the feature columns
    # are not assigned onto the frame handed out by train_test_split.
    featured = question_set.reset_index()
    prepare_dataframe(featured)
    for index in range(len(featured)):
        extract_data_from_the_frame(featured, index)
    return featured


#Setting up the training dataset
trainingSet = _add_question_features(trainingSet)

#Setting up the validation dataset
testSet = _add_question_features(testSet)

In [None]:
#Neural network: fit an MLP classifier on the engineered training features.
#The frame-building code is repeated in the other model cells.
from sklearn import linear_model  # not used in this cell; kept in case other cells rely on it
from pandas import DataFrame

from sklearn.neural_network import MLPClassifier

# Fixed random_state so the fitted model is repeatable between runs.
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000, random_state=42)

# Feature frame with short column names; .values drops the index so the
# columns line up by position.
df = DataFrame({
    'score': trainingSet.Score.values,
    'len_of_question': trainingSet.length_of_clean_question.values,
    'upper_char_ratio': trainingSet.upper_char_ratio.values,
    'sentiment': trainingSet.sentiment.values,
    'days_to_close': trainingSet.days_to_close_question.values,
    'code_present': trainingSet.code_present.values,
    'link_present': trainingSet.link_present.values,
    'time_of_the_day': trainingSet.time_of_the_day.values,
    'is_weekday': trainingSet.is_weekday.values})

# The seven inputs of the network; the evaluation cell must pass the same
# columns in the same order.
x = df[['score','len_of_question', 'upper_char_ratio', 'sentiment', 'code_present', 'link_present', 'time_of_the_day']].astype(object)
# Target: the ResponseTiming code stored in days_to_close.
y = df['days_to_close'].astype(int)

# Show a sample instead of dumping the whole frame.
print(x.head())

mlp.fit(x, y)




In [None]:
#Evaluate the trained MLP on the held-out test set.
from sklearn.metrics import classification_report, confusion_matrix

df_test = DataFrame({
    'score': testSet.Score.values,
    'len_of_question': testSet.length_of_clean_question.values,
    'upper_char_ratio': testSet.upper_char_ratio.values,
    'sentiment': testSet.sentiment.values,
    'days_to_close': testSet.days_to_close_question.values,
    'code_present': testSet.code_present.values,
    'link_present': testSet.link_present.values,
    'time_of_the_day': testSet.time_of_the_day.values,
    'is_weekday': testSet.is_weekday.values})

# Features and labels come from df_test (the training frame df was used here
# before, so the report described training data). The column list is the
# same seven columns, in the same order, that mlp was fitted on; with only
# six of them predict() rejects the input.
x_test = df_test[['score','len_of_question', 'upper_char_ratio', 'sentiment', 'code_present', 'link_present', 'time_of_the_day']].astype(object)
y_test = df_test['days_to_close'].astype(int)
predictions = mlp.predict(x_test)

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))


In [None]:
#SVM model: assemble the training frame and fit a linear-kernel classifier
from pandas import DataFrame
from sklearn import svm

# Raw numpy views of the engineered training columns.
score = trainingSet["Score"].values
len_of_question = trainingSet["length_of_clean_question"].values
upper_char_ratio = trainingSet["upper_char_ratio"].values
days_to_close = trainingSet["days_to_close_question"].values
sentiment = trainingSet["sentiment"].values
code_presence = trainingSet["code_present"].values
link_presence = trainingSet["link_present"].values
time_of_the_day = trainingSet["time_of_the_day"].values
is_weekday = trainingSet["is_weekday"].values

# Same columns under the short names used by the model cells.
model = dict(
    score=score,
    len_of_question=len_of_question,
    upper_char_ratio=upper_char_ratio,
    sentiment=sentiment,
    days_to_close=days_to_close,
    code_present=code_presence,
    link_present=link_presence,
    time_of_the_day=time_of_the_day,
    is_weekday=is_weekday,
)
df = DataFrame(model, columns=list(model))

# Four inputs for the SVM; the target is the ResponseTiming code.
x = df.loc[:, ['score','len_of_question', 'upper_char_ratio', 'sentiment']].astype(object)
print(x)
y = df.loc[:, 'days_to_close'].astype(int)

svm_model = svm.SVC(kernel='linear', C=1, gamma=1)
svm_model.fit(x,y)


In [None]:
#Test Set for SVM Model: evaluate svm_model on the held-out test set.
from sklearn.metrics import classification_report, confusion_matrix

df_test = DataFrame({
    'score': testSet.Score.values,
    'len_of_question': testSet.length_of_clean_question.values,
    'upper_char_ratio': testSet.upper_char_ratio.values,
    'sentiment': testSet.sentiment.values,
    'days_to_close': testSet.days_to_close_question.values,
    'code_present': testSet.code_present.values,
    'link_present': testSet.link_present.values,
    'time_of_the_day': testSet.time_of_the_day.values,
    'is_weekday': testSet.is_weekday.values})

# Features and labels come from df_test; the training frame df was used
# here before, so the report described training data. Same four columns
# the SVM was fitted on.
x_test = df_test[['score','len_of_question', 'upper_char_ratio', 'sentiment']].astype(object)
y_test = df_test['days_to_close'].astype(int)
predictions = svm_model.predict(x_test)

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

In [None]:
#KNN: assemble the training frame and fit a 5-nearest-neighbours classifier
from pandas import DataFrame
from sklearn.neighbors import KNeighborsClassifier

# Engineered training columns as numpy arrays.
score = trainingSet["Score"].values
len_of_question = trainingSet["length_of_clean_question"].values
upper_char_ratio = trainingSet["upper_char_ratio"].values
days_to_close = trainingSet["days_to_close_question"].values
sentiment = trainingSet["sentiment"].values
code_presence = trainingSet["code_present"].values
link_presence = trainingSet["link_present"].values
time_of_the_day = trainingSet["time_of_the_day"].values
is_weekday = trainingSet["is_weekday"].values

# Insertion order of this dict is the column order of the frame.
model = {}
model['score'] = score
model['len_of_question'] = len_of_question
model['upper_char_ratio'] = upper_char_ratio
model['sentiment'] = sentiment
model['days_to_close'] = days_to_close
model['code_present'] = code_presence
model['link_present'] = link_presence
model['time_of_the_day'] = time_of_the_day
model['is_weekday'] = is_weekday

df = DataFrame(model, columns=list(model.keys()))

# Four inputs for the classifier; the target is the ResponseTiming code.
x = df.loc[:, ['score','len_of_question', 'upper_char_ratio', 'sentiment']].astype(object)
print(x)
y = df.loc[:, 'days_to_close'].astype(int)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x,y)

In [None]:
#KNN Test Set: evaluate knn on the held-out test set.
from sklearn.metrics import classification_report, confusion_matrix

df_test = DataFrame({
    'score': testSet.Score.values,
    'len_of_question': testSet.length_of_clean_question.values,
    'upper_char_ratio': testSet.upper_char_ratio.values,
    'sentiment': testSet.sentiment.values,
    'days_to_close': testSet.days_to_close_question.values,
    'code_present': testSet.code_present.values,
    'link_present': testSet.link_present.values,
    'time_of_the_day': testSet.time_of_the_day.values,
    'is_weekday': testSet.is_weekday.values})

# Features and labels come from df_test; the training frame df was used
# here before, so the report described training data. Same four columns
# the classifier was fitted on.
x_test = df_test[['score','len_of_question', 'upper_char_ratio', 'sentiment']].astype(object)
y_test = df_test['days_to_close'].astype(int)
predictions = knn.predict(x_test)

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

x_test.shape