### Import the required libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import glob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Read all CSVs with stock data and combine them into one big DataFrame

In [None]:
# Load every per-stock CSV from the CSVs folder into one DataFrame, tagging
# each row with the ticker taken from the file name (name without ".csv").
os.chdir("/Users/olegkazanskyi/Documents/GitHub/Trading/CSVs")
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/test split further down) reproducible.
filepaths = sorted(f for f in os.listdir("./") if f.endswith('.csv'))

frames = []
for csv_name in filepaths:
    stock_df = pd.read_csv(csv_name)
    stock_df["stock"] = os.path.splitext(csv_name)[0]
    frames.append(stock_df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# growing frame on every file. Fall back to an empty frame when no CSV exists.
df = pd.concat(frames) if frames else pd.DataFrame()

# Return to the project root so later relative paths resolve from there.
os.chdir("/Users/olegkazanskyi/Documents/GitHub/Trading")

### Let's keep only the KPI-type columns that we initially considered the most important

In [None]:
# Columns kept for the analysis: the date, the KPI/feature columns, the
# 30-day forward price change, and the stock ticker.
kpi_columns = [
    "date",
    "roe",
    "longTermDebtEquity",
    "grossMargin",
    "revenueQoQ",
    "rps",
    "epsQoQ",
    "piotroskiFScore",
    "currentRatio",
    "roa",
    "profitMargin",
    "peRatio",
    "pbRatio",
    "trailingPEG1Y",
    "VIX_high",
    "sector",
    "industry",
    "10Y_bonds",
    "10Y_bond_MoM",
    "Debt-to-Equity_Ratio",
    "DividendsYield",
    "PayoutRatio",
    "Acc_Rec_Pay_Ration",
    "Earnings_per_stock",
    "dividends_change",
    "prev_div_change",
    "days_after_divid_report",
    "surprise_%",
    "expected_growth",
    "previous_surprise",
    "days_after_earn_report",
    "future_30dprice_change",
    "stock",
]
df = df[kpi_columns]

### Set the date column as the index

In [None]:
# Use the "date" column as the row index (modifies df in place).
df.set_index("date", inplace=True)

###The column future_30dprice_change is our target column
###We do not need rows with nan values there

In [None]:
# Keep only the rows that have a value in the target column.
df = df.dropna(subset=["future_30dprice_change"])

### Let's check how many missing values we have per column

In [None]:
# Number of missing values in each column.
df.isna().sum()

### There are 691 empty values related to dividends.
### As this dataframe consists of data from 30 companies, this shows that one company in the list doesn't pay dividends.
### The stock is CRM.
### We should not remove this data; it's better to replace it with 0's,
### as there are many companies that do not pay dividends that we may analyze in the future.

In [None]:
# Tickers of the rows that have no dividends_change value.
df.loc[df["dividends_change"].isna(), "stock"]

# Treat missing dividend information as zero in all three dividend columns.
dividend_columns = ["dividends_change", "prev_div_change", "days_after_divid_report"]
for column in dividend_columns:
    df[column] = df[column].fillna(0)

### We can drop rows where 10Y_bond_MoM is NaN. Those are the earliest days in the dataset.
### We had a limit of 1450 days of historical data when creating it.

In [None]:
# Discard the rows that have no 10Y_bond_MoM value.
df = df.dropna(subset=['10Y_bond_MoM'])

#We can also drop rows with a blank previous surprise.
#We are not losing much data, and it is related to the limit of historical numbers in the calculations.

In [None]:
# Discard the rows that have no previous_surprise value.
df = df.dropna(subset=['previous_surprise'])

### For several stocks currentRatio is not available.
### This KPI is the ratio of current assets to current liabilities, i.e. a measure of short-term liquidity.
### We should not remove these rows, as we would lose the data for a whole stock.
### Let's replace the missing values with average values,
### but first we need to split our dataset into train and test sets to avoid contamination.

In [None]:
# Split the rows 70/30 into train and test sets; random_state fixes the
# shuffle so the split is reproducible.
# NOTE(review): rows are shuffled across dates and stocks; if neighbouring
# days share overlapping 30-day target windows this may leak information
# between the two sets -- confirm whether a time-based split is needed.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=1)

# Take explicit copies: both frames are modified in later cells, and assigning
# into a frame that pandas still tracks as a slice of df triggers
# SettingWithCopyWarning and leaves view/copy behavior ambiguous.
df_train = df_train.copy()
df_test = df_test.copy()

### Now we can deal with currentRatio
### First let's replace NaN with the sector mean in the training data

In [None]:
# Per-row mean currentRatio of the row's sector, computed on the training set.
sector_mean_ratio = df_train.groupby('sector')['currentRatio'].transform('mean')
# Fill only the missing currentRatio values with their sector's mean.
df_train['currentRatio'] = df_train['currentRatio'].fillna(sector_mean_ratio)

### Now let's apply the same values from the training data to the test data

In [None]:
# Impute missing currentRatio in the test set with the per-sector mean learned
# from the training set, so both sets are imputed by the same rule and no
# test-set statistic is used.
# Only rows that are actually missing are filled; the previous per-stock loop
# overwrote every currentRatio row of an affected stock, including observed
# values, and produced NaN for a stock that has no rows in the training set.
train_sector_mean_ratio = df_train.groupby('sector')['currentRatio'].mean()
missing_ratio = df_test['currentRatio'].isna()

# Kept for inspection: the stocks whose test rows needed imputation.
list_of_stocks_empty_ratio = df_test.loc[missing_ratio, 'stock'].unique()

df_test.loc[missing_ratio, 'currentRatio'] = (
    df_test.loc[missing_ratio, 'sector'].map(train_sector_mean_ratio)
)

#find the highly correlated columns so we can remove those

In [None]:
def plot_corr(df, size=11):
    """Draw an annotated heatmap of the pairwise column correlations of *df*.

    Only numeric and boolean columns are included in the correlation matrix.

    Args:
        df: DataFrame whose column correlations are plotted.
        size: Width and height passed to ``figsize`` for the square figure.
    """
    # Select numeric/bool columns explicitly: recent pandas versions no longer
    # drop text columns silently in corr() and raise on them instead.
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    _, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # Rotate the x labels so long column names do not overlap each other.
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    # Write each coefficient, rounded to one decimal, into its cell.
    for (i, j), z in np.ndenumerate(corr):
        ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')
        
# Plot the correlation heatmap for the training set.
plot_corr(df_train)

### We have 4 columns with correlation higher than 0.95:
### longTermDebtEquity correlates with pbRatio
### grossMargin correlates with profitMargin
### From the definitions of those KPIs we understand that they are very close, so we can drop one column from each pair

In [None]:
# Remove the two redundant KPI columns from both splits, in place.
redundant_columns = ["longTermDebtEquity", "grossMargin"]
for frame in (df_train, df_test):
    frame.drop(columns=redundant_columns, inplace=True)

###Let's check if there are variables with correlation above 0.5

In [None]:
# Correlation matrix of the numeric/bool training columns. Text columns are
# excluded explicitly because recent pandas versions raise on them in corr().
numeric_corr = df_train.select_dtypes(include=["number", "bool"]).corr()
# Blank out only the diagonal (each column against itself). Replacing every
# value equal to 1 would also hide genuinely perfect off-diagonal correlations.
corr = numeric_corr.mask(np.eye(len(numeric_corr), dtype=bool))
# Keep coefficients above 0.5; every other cell becomes NaN.
# NOTE(review): strong negative correlations (below -0.5) are not kept by this
# filter -- confirm whether abs() should be applied.
corr = corr[corr > 0.5]

### There are no columns with correlation above 0.5



### Now we'd better separate the numeric columns to check their distributions and do a deeper analysis

In [None]:
# Column names grouped by dtype: numeric columns vs. object (text) columns.
cols_num = df_train.select_dtypes(include=[np.number]).columns
cols_str = df_train.select_dtypes(include='object').columns

# One 100-bin histogram per numeric column, laid out on a 14 x 2 grid.
df_train[cols_num].hist(
    stacked=False,
    bins=100,
    figsize=(12, 30),
    layout=(14, 2),
)