In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import warnings
import pickle
import time
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import hamming_loss
from sklearn.cluster import KMeans
import logging
from scipy.sparse import hstack
warnings.filterwarnings("ignore")
plt.style.use('bmh')
%matplotlib inline

In [12]:
# Setting a random seed in order to keep the same random results each time I run the notebook
np.random.seed(seed=11)

In [21]:
import os 
os.curdir
data_directory ='./data/stacksample'
print(os.listdir(data_directory))

['Answers.csv', 'Tags.csv', 'Questions.csv']


In [29]:
questions_csv_directory = data_directory+"/Questions.csv"
print(questions_csv_directory)


./data/stacksample/Questions.csv


In [89]:
questions = pd.read_csv("./data/stacksample/Questions.csv", encoding="ISO-8859-1")

In [90]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [88]:
tags = pd.read_csv("./data/stacksample/Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})


In [91]:
tags.head(5)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [92]:
tags['Tag'] = tags['Tag'].astype(str)

In [93]:
tags['Tag']

0                            flex
1                  actionscript-3
2                             air
3                             svn
4                     tortoisesvn
5                          branch
6           branching-and-merging
7                             sql
8                         asp.net
9                         sitemap
10                      algorithm
11              language-agnostic
12                         colors
13                    color-space
14                             c#
15                           .net
16                      scripting
17          compiler-construction
18                            c++
19                            oop
20                          class
21                   nested-class
22                           .net
23                   web-services
24                     sql-server
25                sql-server-2005
26                     deployment
27             release-management
28                             c#
29            

In [94]:
grouped_tags = tags.groupby("Id")['Tag'].apply(lambda tags: ' '.join(tags))


In [95]:
grouped_tags.head(5)

Id
80                            flex actionscript-3 air
90       svn tortoisesvn branch branching-and-merging
120                               sql asp.net sitemap
180    algorithm language-agnostic colors color-space
260           c# .net scripting compiler-construction
Name: Tag, dtype: object

In [96]:
grouped_tags.reset_index()

Unnamed: 0,Id,Tag
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction
5,330,c++ oop class nested-class
6,470,.net web-services
7,580,sql-server sql-server-2005 deployment release-...
8,650,c# visual-studio versioning
9,810,windows visual-studio registry installation


In [97]:
grouped_tags_final = pd.DataFrame({'Id':grouped_tags.index, 'Tags':grouped_tags.values})

In [98]:
grouped_tags_final

Unnamed: 0,Id,Tags
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction
5,330,c++ oop class nested-class
6,470,.net web-services
7,580,sql-server sql-server-2005 deployment release-...
8,650,c# visual-studio versioning
9,810,windows visual-studio registry installation


In [99]:
questions.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace=True)


In [100]:
questions

Unnamed: 0,Id,Score,Title,Body
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
5,330,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
6,470,13,Homegrown consumption of web services,<p>I've been writing a few web services for a ...
7,580,21,Deploying SQL Server Databases from Test to Live,<p>I wonder how you guys manage deployment of ...
8,650,79,Automatically update version number,<p>I would like the version property of my app...
9,810,9,Visual Studio Setup Project - Per User Registr...,<p>I'm trying to maintain a Setup Project in <...


In [101]:
questions = questions.merge(grouped_tags_final, on='Id')

In [103]:
questions.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction


In [104]:
questions_high_score = questions[questions['Score']>5]

In [106]:
questions_high_score.count()

Id       72950
Score    72950
Title    72950
Body     72950
Tags     72950
dtype: int64

### Cleaning Data

In [107]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [111]:
if 'a' in punctuation:
    print('yes')
else:
    print('no')
    

no
