In [4]:
import pandas as pd
import codecs
from sklearn.svm import SVR

In [None]:
def re_encode(source_file,
            source_encoding,
            destination_file,
            destination_encoding,
            chunk_size=1024 * 1024):
    """
    Change encoding of a file and write it as a new file

    The source is decoded and re-encoded in chunks, so the whole file is
    never held in memory at once. Line endings are copied through
    untouched (no newline translation in either direction).

    Parameters
    ----------
    source_file : path of the file to read
    source_encoding : name of the codec used to decode ``source_file``
    destination_file : path of the file to create (overwritten if present)
    destination_encoding : name of the codec used to encode the output
    chunk_size : number of characters decoded per read (default ~1M)

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or if both paths resolve to
        the same file (opening it for writing would truncate it before
        anything could be read).
    """
    import os  # local import keeps this cell self-contained

    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    if os.path.realpath(source_file) == os.path.realpath(destination_file):
        raise ValueError('source_file and destination_file must be different files')

    # newline='' disables newline translation on both sides, so '\r\n'
    # and '\r' in the source survive the round trip unchanged.
    with open(source_file,
              'r',
              encoding=source_encoding,
              newline='') as source_handle:
        with open(destination_file,
                  'w',
                  encoding=destination_encoding,
                  newline='') as destination_handle:
            # read() returns '' at end of file, which ends the iteration
            for chunk in iter(lambda: source_handle.read(chunk_size), ''):
                destination_handle.write(chunk)
            

In [76]:
def standardize_column_names(df):
    """
    Normalise the column labels of ``df`` in place.

    Each label is lower-cased and every space in it is replaced with an
    underscore. ``df.columns`` is reassigned; nothing is returned.
    """
    def _normalise(label):
        # 'Owner User Id' -> 'owner_user_id'
        return label.lower().replace(' ', '_')

    df.columns = list(map(_normalise, df.columns))

In [20]:
# Write a UTF-8 copy of each raw CSV, decoding it as 'macintosh'
for raw_path, utf8_path in [
    ('Data/Questions.csv', 'Data/Questions_utf8.csv'),
    ('Data/Answers.csv', 'Data/Answers_utf8.csv'),
    ('Data/Tags.csv', 'Data/Tags_utf8.csv'),
]:
    re_encode(raw_path, 'macintosh', utf8_path, 'utf-8')

In [77]:
# Load the UTF-8 copies of the three CSV files
questions = pd.read_csv('Data/Questions_utf8.csv', encoding='utf-8')
answers = pd.read_csv('Data/Answers_utf8.csv', encoding='utf-8')
tags = pd.read_csv('Data/Tags_utf8.csv', encoding='utf-8')

# Lower-case the column names and swap spaces for underscores (in place)
for frame in (questions, answers, tags):
    standardize_column_names(frame)

In [78]:
# Preview the first five rows of the questions frame
questions.head()

Unnamed: 0,id,owneruserid,creationdate,score,title,body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [79]:
# Preview the first five rows of the answers frame
answers.head()

Unnamed: 0,id,owneruserid,creationdate,parentid,score,body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."


In [80]:
# Preview the first five rows of the tags frame
tags.head()

Unnamed: 0,id,tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


### Restructuring `tags` data frame

In [81]:
# Count the rows of `tags` whose `tag` value is null.
# NOTE(review): pandas.read_csv parses strings such as 'null', 'nan' and 'NA'
# as missing by default, so some of these rows may be real tags — confirm
# against the raw file.
num_missing_tags = tags['tag'].isna().sum()
print(f'Number of questions missing tags: {num_missing_tags}')

Number of questions missing tags: 443
Removing these rows from the data frame.


*The rows with a missing tag are removed from the data frame in the next cell.*

In [None]:
# Drop only the rows whose `tag` is missing. Rebinding `tags` (instead of
# inplace=True) leaves the loaded frame object untouched, and re-running
# this cell gives the same result.
tags = tags.dropna(subset=['tag'])

### Grouping tags based on question id

In [82]:
# Collapse `tags` to one row per question id, with that question's tags
# joined into a single space-separated string.
tags = (
    tags
    .groupby('id')['tag']
    .apply(' '.join)
    .reset_index()
)
tags.head()

Unnamed: 0,id,tag
0,469,python osx fonts photoshop
1,502,python windows image pdf
2,535,python continuous-integration extreme-programming
3,594,python sql database oracle cx-oracle
4,683,python arrays iteration


### Merging `tags` df with `questions` df

In [84]:
# Left join: every question row is kept; questions with no entry in `tags`
# get a null `tag`.
df = questions.merge(tags, how='left', on='id')
df.head()

Unnamed: 0,id,owneruserid,creationdate,score,title,body,tag
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,python osx fonts photoshop
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,python windows image pdf
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,python continuous-integration extreme-programming
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,python sql database oracle cx-oracle
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,python arrays iteration
