### Importing Libraries

In [87]:
import warnings
import pandas as pd
import numpy as np

warnings.simplefilter(action='ignore', category=FutureWarning)

## Loading and Analysing Dataframe

In [88]:
df = pd.read_csv('../data/raw/train-sample.csv')

In [89]:
print(df.columns)
df.head(3)

Index(['PostId', 'PostCreationDate', 'OwnerUserId', 'OwnerCreationDate',
       'ReputationAtPostCreation', 'OwnerUndeletedAnswerCountAtPostTime',
       'Title', 'BodyMarkdown', 'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5',
       'PostClosedDate', 'OpenStatus'],
      dtype='object')


Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus
0,6046168,05/18/2011 14:14:05,543315,09/17/2010 10:15:06,1,2,For Mongodb is it better to reference an objec...,I am building a corpus of indexed sentences in...,mongodb,,,,,,open
1,4873911,02/02/2011 11:30:10,465076,10/03/2010 09:30:58,192,24,How to insert schemalocation in a xml document...,i create a xml document with JAXP and search a...,dom,xsd,jaxp,,,,open
2,3311559,07/22/2010 17:21:54,406143,07/22/2010 16:58:20,1,0,Too many lookup tables,What are the adverse effects of having too man...,sql-server,database-design,enums,,,,open


### Features To Create/Encode
From these columns, I consider important:
- Title,
- BodyMarkdown,
- SelectedTags (top N tags, rest as "other"),
- user life at creation (PostCreationDate - OwnerCreationDate)
- OpenStatus

I will create them in the next sections

## Data processing

### Statuses

In [90]:
statuses = df['OpenStatus'].unique()
statuses

array(['open', 'too localized', 'not a real question', 'off topic',
       'not constructive'], dtype=object)

### Most Frequent Tags

In [91]:
tag_column_names = ['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']
tags_long = list(df[tag_column_names].values.ravel('K'))

tags_unique, frequencies = np.unique(tags_long, return_counts=True)
freq_dict = {tags_unique[i]: frequencies[i] for i in range(len(tags_unique))}

In [92]:
tags_freq_arr = sorted(freq_dict.items(), key=lambda kv: -1 * kv[1])

print('there is {} unique tags, some of them (along with frequencies): {}'.format(len(tags_freq_arr), tags_freq_arr[:10]))

there is 18309 unique tags, some of them (along with frequencies): [('nan', 316799), ('php', 13134), ('c#', 12076), ('java', 11870), ('javascript', 8925), ('android', 8502), ('jquery', 6835), ('c++', 6203), ('iphone', 5125), ('python', 4731)]


As we can see, the most frequent value is 'nan' - we do not want to include it so it will be removed in the following cells.

We will also select N the most frequently occuring tags and classify the rest as `other`

In [93]:
N_tags = 500

tags_freq_arr = tags_freq_arr[1:][:N_tags]

selected_tags = np.empty(shape=N_tags, dtype=tags_unique.dtype)
for i, item in enumerate(tags_freq_arr):
  selected_tags[i] = item[0]

### Selecting Best Features

In [94]:
print(df.columns)
df.head(3)

Index(['PostId', 'PostCreationDate', 'OwnerUserId', 'OwnerCreationDate',
       'ReputationAtPostCreation', 'OwnerUndeletedAnswerCountAtPostTime',
       'Title', 'BodyMarkdown', 'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5',
       'PostClosedDate', 'OpenStatus'],
      dtype='object')


Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus
0,6046168,05/18/2011 14:14:05,543315,09/17/2010 10:15:06,1,2,For Mongodb is it better to reference an objec...,I am building a corpus of indexed sentences in...,mongodb,,,,,,open
1,4873911,02/02/2011 11:30:10,465076,10/03/2010 09:30:58,192,24,How to insert schemalocation in a xml document...,i create a xml document with JAXP and search a...,dom,xsd,jaxp,,,,open
2,3311559,07/22/2010 17:21:54,406143,07/22/2010 16:58:20,1,0,Too many lookup tables,What are the adverse effects of having too man...,sql-server,database-design,enums,,,,open


## Data processing

### Most Frequent Tags

In [117]:
N_tags = 500

tags_freq_arr = tags_freq_arr[1:][:N_tags]

selected_tags = np.empty(shape=N_tags, dtype=tags_unique.dtype)
for i, item in enumerate(tags_freq_arr):
  selected_tags[i] = item[0]

['javascript' 'android' 'jquery' 'c++' 'iphone' 'python' 'html' 'asp.net'
 'mysql' '.net' 'c' 'css' 'sql' 'ios' 'objective-c' 'linux'
 'ruby-on-rails' 'database' 'ruby' 'windows' 'facebook' 'ajax' 'xml'
 'xcode' 'sql-server' 'wpf' 'algorithm' 'regex' 'django' 'arrays'
 'asp.net-mvc' 'eclipse' 'html5' 'wordpress' 'osx' 'vb.net' 'performance'
 'web-services' 'image' 'apache' 'security' 'json' 'string'
 'visual-studio-2010' 'perl' 'ruby-on-rails-3' 'books' 'asp.net-mvc-3'
 'ubuntu' 'php5' 'ipad' 'multithreading' 'mvc' 'api' 'winforms' 'oop'
 'visual-studio' 'forms' 'flash' 'application' 'sql-server-2008' 'wcf'
 'oracle' 'google' 'silverlight' 'math' 'design' 'linq' 'git' 'networking'
 'homework' 'cocoa' 'bash' 'programming-languages' 'email' 'spring'
 'query' 'file' 'r' 'unix' 'excel' 'actionscript-3' 'flex' 'delphi'
 'codeigniter' 'frameworks' 'class' 'windows-phone-7' 'qt'
 'design-patterns' 'hibernate' 'open-source' 'function' '.htaccess'
 'web-applications' 'java-ee' 'sharepoint' 'goo

### Days Since Account Creation Till Posting Question

In [96]:
df['OwnerCreationDate'] = pd.to_datetime(df['OwnerCreationDate'], format='mixed')
df['PostCreationDate'] = pd.to_datetime(df['PostCreationDate'], format='mixed')

In [97]:
df['DaysTillPosting'] = (df['PostCreationDate'] - df['OwnerCreationDate']).dt.days

### Processing Tags

In [98]:
def getRecognizedTags(row):
  
  res = row[tag_column_names]
  
  for column_name in tag_column_names:
    tag = row[column_name]
    if tag != np.nan:
      indexes = np.where(selected_tags == tag)[0]
      
      if len(indexes == 1):
        res[column_name] = indexes[0]
      else:
        res[column_name] = np.nan
  
  return res

def getUnrecognizedTags(row):
  unrecognized_tags = 0
  
  for tag in row[tag_column_names].values:
    if(~np.isin(tag, selected_tags)):
      unrecognized_tags += 1
  
  return unrecognized_tags

In [99]:
df[:2]

Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus,DaysTillPosting
0,6046168,2011-05-18 14:14:05,543315,2010-09-17 10:15:06,1,2,For Mongodb is it better to reference an objec...,I am building a corpus of indexed sentences in...,mongodb,,,,,,open,243
1,4873911,2011-02-02 11:30:10,465076,2010-10-03 09:30:58,192,24,How to insert schemalocation in a xml document...,i create a xml document with JAXP and search a...,dom,xsd,jaxp,,,,open,122


In [100]:
df[tag_column_names] = df.apply(getRecognizedTags, axis=1)
df["UnrecognizedTags"] = df.apply(getUnrecognizedTags, axis=1)

### Picking important columns

In [101]:
# important_columns = ['ReputationAtPostCreation', 'OwnerUndeletedAnswerCountAtPostTime', 'Title', 'BodyMarkdown', 'DaysTillPosting', 'RecognizedTags', 'UnrecognizedTags', 'OpenStatus']
important_columns = ['ReputationAtPostCreation', 'OwnerUndeletedAnswerCountAtPostTime', 'Title', 'BodyMarkdown', 'DaysTillPosting', 'UnrecognizedTags', 'OpenStatus', 'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']
df_final = df[important_columns]

In [102]:
df_final.head(5)

Unnamed: 0,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,DaysTillPosting,UnrecognizedTags,OpenStatus,Tag1,Tag2,Tag3,Tag4,Tag5
0,1,2,For Mongodb is it better to reference an objec...,I am building a corpus of indexed sentences in...,243,5,open,142.0,,,,
1,192,24,How to insert schemalocation in a xml document...,i create a xml document with JAXP and search a...,122,5,open,218.0,,,,
2,1,0,Too many lookup tables,What are the adverse effects of having too man...,0,5,open,26.0,147.0,,,
3,4,1,What is this PHP code in VB.net,I am looking for the vb.net equivalent of this...,258,5,too localized,,37.0,,,
4,334,14,Spring-Data mongodb querying multiple classes ...,"With Spring-Data, you can use the @Document an...",453,5,open,142.0,,,,


### Saving Processed Data

In [115]:
df_final.to_csv('../data/processed/train_sample_processed.csv')

# Save the array to a binary file
np.save('../data/processed/selected_tags.npy', selected_tags)