In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import csv
import klib 
import matplotlib.pyplot as plt
import datetime

In [None]:
# Location of the raw dataset, relative to the notebook.
base_dir = "../datasets/"
key = "Data.csv"

# Read the raw CSV into the working frame.
df = pd.read_csv(f"{base_dir}{key}")

In [None]:
# Clean the column names with klib and rebind `df` to the result; later cells
# access columns by the cleaned names (e.g. `last_published`).
df = klib.clean_column_names(df)

In [None]:
# Preview the first rows of `df`.
df.head()

## Parse last published 

In [None]:
# Show every distinct `last_published` value with its frequency.
# The row limit is lifted only inside this block (instead of globally via
# pd.set_option) so that the other cells keep pandas' default truncation.
with pd.option_context("display.max_rows", None):
    display(df['last_published'].value_counts())

In [None]:
def convert(df):
    """Convert one row's `last_published` age string into hours.

    The value is expected to be a number followed by a one-letter unit:
    `d` (days), `h` (hours), `m` (minutes) or `s` (seconds). Underscores are
    removed before parsing.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row (the function is applied with ``axis=1``); only the
        ``'last_published'`` entry is read.

    Returns
    -------
    float or None
        The age in hours, or ``None`` when the string is empty or the unit is
        not recognised. Values that are not strings (already-converted
        numbers, NaN) are returned unchanged, so applying the function twice
        does not fail.

    Raises
    ------
    ValueError
        If the part before the unit letter is not a valid number.
    """
    raw = df['last_published']
    if not isinstance(raw, str):
        # Already numeric or missing: nothing to parse.
        return raw

    parsed = raw.replace('_', '')
    if not parsed:
        return None

    day_or_hr = parsed[-1]
    val = float(parsed[:-1])
    if day_or_hr == 'd':
        return val * 24
    if day_or_hr == 'h':
        return val
    if day_or_hr == 'm':
        return val / 60
    if day_or_hr == 's':
        # 3600 seconds per hour (the previous divisor of 360 was off by 10x).
        return val / 3600
    return None
        
        

In [None]:
# Run `convert` on every row (axis=1) and overwrite the original
# `last_published` text values with its result.
df['last_published'] = df.apply(convert, axis = 1)

## Parse language

In [None]:
from sklearn import preprocessing

# Build the label encoder and fit it on `language` in one step (`fit` returns
# the encoder itself), then show it as the cell output.
le = preprocessing.LabelEncoder().fit(df['language'])
le

In [None]:
# Inspect the label classes the encoder learned from `language`.
le.classes_

In [None]:
# Store the integer label encoding of `language` in a new `parsed_language`
# column (fit_transform refits the encoder on the same column first).
df['parsed_language'] = le.fit_transform(df['language'])
    

In [None]:
# The text column is no longer needed once `parsed_language` exists.
df.drop(columns=['language'], inplace=True)

## Parse is_copied

In [None]:
# Refit the same encoder object on `is_copied`; this replaces the classes it
# learned from `language`.
le.fit(df['is_copied'])

In [None]:
# Inspect the label classes the encoder learned from `is_copied`.
le.classes_

In [None]:
# Overwrite `is_copied` with its integer label encoding.
df['is_copied'] = le.fit_transform(df['is_copied'])

## Output

In [None]:
# Write the frame with the parsed columns to Data_cleaned.csv in the working
# directory, without the index column.
df.to_csv('Data_cleaned.csv',index=False)

## Features Removed

- `market_cap_usd`, stars and watchers are highly correlated with each other, but were not removed because they measure different things (note: the drop cell below nevertheless lists `market_cap_usd`)
- `changes_1y` removed (correlated with `changes_7d` and contributors)
- `changes_7d` removed (correlated with contributors and `changes_1y`)
- `removed_1y` removed (very highly correlated with added_1y)
- `changes_24h` removed because of low significance
- `added_24h` removed because of low significance

In [None]:
# List the columns currently in `df`.
df.columns

In [None]:
# Remove the features listed in the notes above, modifying `df` in place.
# NOTE(review): the notes say market_cap_usd is *not* removed, yet it is part
# of this list - confirm which of the two is intended.
df.drop(
    columns=[
        'market_cap_usd',
        'changes_1y',
        'changes_7d',
        'removed_1y',
        'changes_24h',
        'added_24h',
    ],
    inplace=True,
)

In [None]:
# Summarise the columns remaining in `df`.
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix and the target here: the cell that defines them
# appears further down in the notebook, so on a fresh kernel (Restart & Run
# All) the split would otherwise fail with a NameError.
raw_features = df.drop(['price_share_usd'], axis=1)
target = df['price_share_usd']
# NOTE(review): columns such as `company_code` and `time_date` are still in
# `df` at this point; if they are non-numeric the regressor below will reject
# them - confirm and drop/encode them first.

# 80/20 split; the seed matches the regressor's random_state so that the
# scores are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    raw_features, target, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 10 trees with a fixed seed. `fit` returns the estimator itself, so
# construction and training are chained; the bare name shows it as the output.
regressor = RandomForestRegressor(n_estimators=10, random_state=42).fit(x_train, y_train)
regressor

## Eval

In [None]:
# Score on the training split first, then on the held-out split, one per line.
print(
    regressor.score(x_train, y_train),
    regressor.score(x_test, y_test),
    sep="\n",
)


In [None]:
# Target column, and every remaining column as candidate features.
target = df['price_share_usd']
raw_features = df.drop(columns=['price_share_usd'])

In [None]:
# Correlation heatmap of the candidate features (target column excluded).
# Only numeric/boolean columns are correlated: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently.
corr = raw_features.select_dtypes(include=['number', 'bool']).corr()
top_features = corr.index  # kept so the name stays defined for other cells

# The matrix is computed once and reused for the plot.
fig, ax = plt.subplots(figsize=(21, 21))
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Correlation between candidate features");

In [None]:
cutoff = 0.6  # absolute correlation above which a column is flagged


def detect_corr(dataset, cutoff):
    """Return the names of columns that are strongly correlated with an earlier column.

    Columns are compared pairwise in their order of appearance. For every pair
    whose absolute correlation exceeds `cutoff`, the *later* column of the
    pair is flagged; the earlier one is left out of the result unless it is
    itself flagged by a pair with a still earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to analyse. Only numeric and boolean columns take part in the
        correlation; other columns are ignored (pandas >= 2.0 would otherwise
        raise inside ``DataFrame.corr``).
    cutoff : float
        Threshold applied to the absolute correlation coefficient.

    Returns
    -------
    set of str
        Names of the flagged columns (empty when no pair exceeds `cutoff`).
    """
    corr_matrix = dataset.select_dtypes(include=['number', 'bool']).corr().abs()
    columns = corr_matrix.columns
    # Only the strictly lower triangle (j < i) is inspected, so each pair is
    # visited once. NaN coefficients never compare greater than the cutoff.
    return {
        columns[i]
        for i in range(len(columns))
        if any(corr_matrix.iloc[i, j] > cutoff for j in range(i))
    }

In [None]:
# Columns of `raw_features` flagged by `detect_corr` at the chosen `cutoff`.
detect_corr(raw_features,cutoff)

- `market_cap_usd`, stars and watchers are highly correlated with each other, but were not removed because they measure different things
- `changes_1y` removed (correlated with `changes_7d` and contributors)
- `changes_7d` removed (correlated with contributors and `changes_1y`)
- `removed_1y` removed (very highly correlated with added_1y)


- `changes_24h` removed because of low significance
- `added_24h` removed because of low significance

In [None]:
# Correlation heatmap of every numeric/boolean column of `df`, including the
# target. Non-numeric columns are excluded explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently.
corr = df.select_dtypes(include=['number', 'bool']).corr()
top_features = corr.index  # kept so the name stays defined for other cells

# The matrix is computed once and reused for the plot.
fig, ax = plt.subplots(figsize=(21, 21))
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Correlation between all columns (features and target)");

### High correlation with share price

- market_cap
- watchers (corr with market cap)
- stars (corr with market cap)
- contribution_all_time

In [None]:
# List the columns currently in `df`.
df.columns

In [None]:
# Distribution plots of `df` via klib (presumably one per numeric column -
# confirm against the klib documentation).
klib.dist_plot(df)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# `feature_selection_df` is a second name for the same object as `df` (no copy
# is made), so the in-place drop below removes `company_code` from `df` too.
feature_selection_df = df
feature_selection_df.drop(columns=['company_code'], inplace=True)

In [None]:
# Preview the frame used for feature selection.
feature_selection_df.head()

In [None]:
# Target vector, and all other columns as the feature matrix.
y = df['price_share_usd']
X = feature_selection_df.drop(columns=['price_share_usd'])

In [None]:
from sklearn.feature_selection import f_regression

# `price_share_usd` is the continuous target used for the regressor in this
# notebook, so features are scored with the univariate F-test for regression.
# chi2 is a classification score: it treats every distinct target value as a
# class label and rejects negative feature values.
# k='all' keeps every feature; the selector is used for its scores only.
ordered_rank_features = SelectKBest(score_func=f_regression, k='all')
ordered_feature = ordered_rank_features.fit_transform(X, y)

## Time series

In [None]:
# Parse `time_date` as datetimes and keep only the calendar-date part,
# overwriting the original column.
df['time_date'] = pd.to_datetime(df['time_date']).dt.date

In [None]:
import plotly.express as px  # `px` is used below but was never imported
import plotly.graph_objects as go

# Monthly histogram over `time_date`.
# NOTE(review): `dealings_true` is not defined anywhere in the visible notebook
# - confirm where it comes from before running this cell.
# NOTE(review): the title says "Sum" while histfunc="count" counts records -
# confirm which one is intended.
fig = px.histogram(
    dealings_true,
    x="time_date",
    y="dealing",
    histfunc="count",
    title="Sum Of dealings per month",
)
fig.update_traces(xbins_size="M1")  # one bin per month
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.update_layout(bargap=0.1)
# fig.add_trace(go.Scatter(mode="markers", x=is_bought_false["time_date"], y=df["cnt"], name="daily"))
fig.show()

## Output clean csv

In [None]:
# Preview the frame that is about to be exported.
df.head()

### Changes made
- Some features removed (refer to the `df.drop` cell near the top)
- `time_date` truncated to the date only, for time-series visualization (e.g. from 2014-08-11 9:00 to 2014-08-11)
- To run the Plotly visualization code, first install the packages from a cell: `%pip install plotly` and `%pip install dash`

## Take note
- Here we are interested in studying the characteristics of customers who decide on their preference for a house in a residence.
- For example, customers at a certain distance may prefer a particular number of adult rooms (TODO: make this example specific).
- Other time-series trends can also be explored.



In [None]:
# Write the final frame to dac_stage2.csv in the working directory, without
# the index column.
df.to_csv('dac_stage2.csv', index=False)