# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
# Load the dataset from the relative path ../data/clean_final.csv into a DataFrame
df = pd.read_csv('../data/clean_final.csv')

In [3]:
# Preview the first five rows
df.head()

Unnamed: 0,index,Id,CreationDate,Title,Body,Tags,ViewCount,AnswerCount,CommentCount,target_tags,overlap_tags,target_class,body_clean
0,1,53992219,2019-01-01 00:01:55,How to programmatically change style sheet of ...,<p>I have so many buttons on a dialog and I wa...,"['c++', 'qt', 'qt5', 'qtstylesheets', 'qpushbu...",775,2,2,{'c++'},c++,4,i have so many buttons on a dialog and i want ...
1,4,53992223,2019-01-01 00:02:37,Unable to print a class list attribute using i...,<p>I am designing a deck class that has <stron...,"['python', 'python-3.x', 'list', 'class', 'pri...",40,2,0,"{'python-3.x', 'python'}",python-3.x python,5,i am designing a deck class that has init meth...
2,8,53992234,2019-01-01 00:05:48,How to rearrange subplots so that one is under...,<p>I am trying to code two plots such that one...,"['python', 'matplotlib', 'subplot']",519,1,1,{'python'},python,5,i am trying to code two plots such that one pl...
3,10,53992248,2019-01-01 00:09:24,Function always returns 1,<p>I´m trying to write a simple branch predict...,"['c++', 'function']",150,1,21,{'c++'},c++,4,i m trying to write a simple branch predictor ...
4,11,53992252,2019-01-01 00:11:20,possible to speed up this query?,<p>I have the following query which takes a li...,"['sql', 'postgresql']",40,1,0,{'sql'},sql,0,i have the following query which takes a littl...


In [4]:
# Remove the stray, unnecessary column labeled 'index'
df = df.drop(columns=['index'])

In [5]:
# Preview the first five rows again to confirm the 'index' column is gone
df.head()

Unnamed: 0,Id,CreationDate,Title,Body,Tags,ViewCount,AnswerCount,CommentCount,target_tags,overlap_tags,target_class,body_clean
0,53992219,2019-01-01 00:01:55,How to programmatically change style sheet of ...,<p>I have so many buttons on a dialog and I wa...,"['c++', 'qt', 'qt5', 'qtstylesheets', 'qpushbu...",775,2,2,{'c++'},c++,4,i have so many buttons on a dialog and i want ...
1,53992223,2019-01-01 00:02:37,Unable to print a class list attribute using i...,<p>I am designing a deck class that has <stron...,"['python', 'python-3.x', 'list', 'class', 'pri...",40,2,0,"{'python-3.x', 'python'}",python-3.x python,5,i am designing a deck class that has init meth...
2,53992234,2019-01-01 00:05:48,How to rearrange subplots so that one is under...,<p>I am trying to code two plots such that one...,"['python', 'matplotlib', 'subplot']",519,1,1,{'python'},python,5,i am trying to code two plots such that one pl...
3,53992248,2019-01-01 00:09:24,Function always returns 1,<p>I´m trying to write a simple branch predict...,"['c++', 'function']",150,1,21,{'c++'},c++,4,i m trying to write a simple branch predictor ...
4,53992252,2019-01-01 00:11:20,possible to speed up this query?,<p>I have the following query which takes a li...,"['sql', 'postgresql']",40,1,0,{'sql'},sql,0,i have the following query which takes a littl...


In [6]:
# Convert the CreationDate column with pd.to_datetime so it holds datetime values
df['CreationDate']= pd.to_datetime(df['CreationDate'])

In [7]:
# Count the missing values in every column
df.isna().sum()

Id              0
CreationDate    0
Title           0
Body            0
Tags            0
ViewCount       0
AnswerCount     0
CommentCount    0
target_tags     0
overlap_tags    0
target_class    0
body_clean      1
dtype: int64

In [8]:
# Show the row(s) whose body_clean value is missing
df.loc[df['body_clean'].isna()]

Unnamed: 0,Id,CreationDate,Title,Body,Tags,ViewCount,AnswerCount,CommentCount,target_tags,overlap_tags,target_class,body_clean
91170,55234780,2019-03-19 06:21:07,1st data is officeIn and 2nd data is OfficeOut...,"<p><a href=""https://i.stack.imgur.com/P3SNS.pn...","['mysql', 'sql']",26,1,0,"{'sql', 'mysql'}",sql mysql,0,


In [9]:
# Look at the raw HTML Body of row 91170, the row whose body_clean is missing
df.loc[91170, 'Body']

'<p><a href="https://i.stack.imgur.com/P3SNS.png" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/P3SNS.png" alt="enter image description here"></a></p>\n\n<pre><code>1476    5   2019-03-18 09:35:06.000\n1487    5   2019-03-18 13:19:53.000\n1488    5   2019-03-18 13:37:40.000\n1495    5   2019-03-18 15:09:38.000\n1497    5   2019-03-18 15:18:26.000\n1503    5   2019-03-18 17:34:46.000\n1504    5   2019-03-18 17:48:23.000\n1511    5   2019-03-18 19:14:51.000\n</code></pre>\n'

In [10]:
# Discard every row that contains at least one missing value.
# The surviving rows keep their original index labels (the index is not reset).
df = df.dropna(axis=0, how='any')

In [11]:
# Re-count missing values per column after dropping incomplete rows
df.isna().sum()

Id              0
CreationDate    0
Title           0
Body            0
Tags            0
ViewCount       0
AnswerCount     0
CommentCount    0
target_tags     0
overlap_tags    0
target_class    0
body_clean      0
dtype: int64

# Exploratory Data Analysis (EDA)

- n-gram bar graphs (most and least used words, bigrams)
    - Most used in Python, SQL, etc.
- Words that appear in more than 80% of the questions
    - vectorizer parameters
        - `max_df`
- Create a weekly graph showing the percentage of questions for each language (use the datetime column and the index)
- Average length of words in each question


## One

In [None]:
# Count the distinct CreationDate values. If this is smaller than the number of
# rows, timestamps repeat and CreationDate cannot serve as a unique index.
df['CreationDate'].nunique()

In [None]:
# (number of rows, number of columns) of df
df.shape

In [None]:
# NOTE(review): plain assignment binds df2 to the same DataFrame object as df
# (no copy is made), so any column added through df2 also appears on df.
# Use df.copy() here if an independent scratch frame is intended.
df2 = df

In [None]:
# Display df2 as the cell output
df2

In [None]:
def check_length(df, col):
    '''
    Return a copy of ``df`` with an extra column holding the number of words in
    each Stack Overflow question.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is not modified.
    col : str
        Name of the text column to measure (e.g. 'body_clean').

    Returns
    -------
    pandas.DataFrame
        New frame with one added column named ``f'{col}_length'`` containing
        the count of whitespace-separated words in each row of ``col``.
    '''
    # Vectorised string operations follow the frame's own index labels, so gaps
    # in the index (e.g. after dropna()) are harmless. A positional
    # range(len(df)) lookup through .loc would raise KeyError on a missing label.
    word_counts = df[col].str.split().str.len()
    return df.assign(**{f'{col}_length': word_counts})

In [None]:
# Number of rows in df
len(df)

In [None]:
# Apply check_length to the body_clean column of df2; the returned value is shown as the cell output
check_length(df2, 'body_clean')

In [None]:
# Character count of each cleaned question body.
# Series.map hands the function each cell *value* (the text itself), so using it
# as a row label in df.loc raised KeyError. The .str accessor measures every
# string directly and does not depend on the index labels.
df['body_length'] = df['body_clean'].str.len()

In [None]:
# Character count of body_clean for the row whose index label is 1
len(df.loc[1,'body_clean'])

In [None]:
# Character count of each cleaned question body, stored on df2.
# The earlier form passed a generator expression to Series.map, which is not a
# valid mapper, and looked rows up by position through the label-based .loc.
# The .str accessor computes the same per-row len() without either problem.
df2['cbody_length'] = df2['body_clean'].str.len()

In [None]:
# Length in characters of the question whose index label is 1
# (.loc selects by label, not by position)
len(df.loc[1,'body_clean'])

In [12]:
# X: the body_clean text column; y: the target_class labels
X, y = df['body_clean'], df['target_class']

In [None]:
# Bag-of-words vectorizer: English stop words are removed, and terms found in
# more than 90% of documents (max_df) or in fewer than 10 documents (min_df)
# are ignored.
cvec = CountVectorizer(min_df=10, max_df=0.9, stop_words='english')

# Learn the vocabulary from X and encode X as a sparse document-term matrix
sparse_cv = cvec.fit_transform(X)

In [None]:
# 95982 (presumably the count observed in an earlier run; TODO confirm)
# Number of terms in the fitted vocabulary, i.e. the number of columns in sparse_cv.
# vocabulary_ is used instead of get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2; vocabulary_ exists in every version.
len(cvec.vocabulary_)

In [None]:
# CVEC DataFrame: one row per document, one column per vocabulary term.
# vocabulary_ maps each term to its column index, so sorting the terms by that
# index reproduces the column order of sparse_cv. This replaces
# get_feature_names(), which was removed in scikit-learn 1.2.
# NOTE(review): .toarray() materialises the full dense matrix; confirm it fits
# in memory for this corpus.
feature_names = sorted(cvec.vocabulary_, key=cvec.vocabulary_.get)
cvec_df = pd.DataFrame(sparse_cv.toarray(), columns=feature_names)

In [None]:
# Column labels of cvec_df (the vocabulary terms)
cvec_df.columns

## Two 