In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Stats Information

In [7]:
# Read the ADANIPORTS daily price file, discard the first CSV column
# (presumably a saved row index -- confirm against the file), and parse the
# 'date' strings into pandas datetimes.
file_path = './../uploads/ADANIPORTS_day_data.csv'
data_df = (
    pd.read_csv(file_path)
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
data_df.head()

Unnamed: 0,date,close,high,low,open,volume
0,2015-01-01 00:00:00+05:30,319.55,322.5,316.25,319.0,1456204
1,2015-01-02 00:00:00+05:30,319.35,325.8,318.05,319.35,2894058
2,2015-01-05 00:00:00+05:30,323.8,327.5,319.35,320.45,2099786
3,2015-01-06 00:00:00+05:30,321.85,331.45,315.6,321.64,3672197
4,2015-01-07 00:00:00+05:30,321.1,328.7,317.39,321.95,2981544


In [5]:
# Break the 'date' column into calendar parts, one column per part.
# 'week' is the ISO calendar week number taken from isocalendar().
data_df['year'] = data_df['date'].dt.year
data_df['month'] = data_df['date'].dt.month
data_df['day'] = data_df['date'].dt.day
data_df['week'] = data_df['date'].dt.isocalendar().week
data_df.head()

Unnamed: 0,date,close,high,low,open,volume,year,month,day,week
0,2015-01-01 00:00:00+05:30,319.55,322.5,316.25,319.0,1456204,2015,1,1,1
1,2015-01-02 00:00:00+05:30,319.35,325.8,318.05,319.35,2894058,2015,1,2,1
2,2015-01-05 00:00:00+05:30,323.8,327.5,319.35,320.45,2099786,2015,1,5,2
3,2015-01-06 00:00:00+05:30,321.85,331.45,315.6,321.64,3672197,2015,1,6,2
4,2015-01-07 00:00:00+05:30,321.1,328.7,317.39,321.95,2981544,2015,1,7,2


In [43]:
# Row-over-row change in the closing price. The first row has no previous
# close, so diff() leaves NaN there; that single cell is overwritten with 0.
data_df['day_profit'] = data_df['close'].diff(periods=1)
data_df.iat[0, data_df.columns.get_loc("day_profit")] = 0
data_df.head()

Unnamed: 0,date,close,high,low,open,volume,year,month,day,week,day_profit
0,2015-01-01 00:00:00+05:30,319.55,322.5,316.25,319.0,1456204,2015,1,1,1,0.0
1,2015-01-02 00:00:00+05:30,319.35,325.8,318.05,319.35,2894058,2015,1,2,1,-0.2
2,2015-01-05 00:00:00+05:30,323.8,327.5,319.35,320.45,2099786,2015,1,5,2,4.45
3,2015-01-06 00:00:00+05:30,321.85,331.45,315.6,321.64,3672197,2015,1,6,2,-1.95
4,2015-01-07 00:00:00+05:30,321.1,328.7,317.39,321.95,2981544,2015,1,7,2,-0.75


In [80]:
class StatsCalculation:
    """Summary statistics and win/lose streak counters for a price DataFrame.

    The instance keeps two running counters (``win_streak`` and
    ``lose_streak``) that ``get_streaks`` updates one observation at a time.
    Every method that reports a date reads it from a column named ``'date'``
    in the DataFrame it is given.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set both running streak counters back to zero."""
        self.win_streak = self.lose_streak = 0

    def get_streaks(self, input_data):
        """Advance the running counters by one observation.

        A truthy ``input_data`` extends the win streak and clears the lose
        streak; a falsy one does the opposite.

        Returns:
            tuple: ``(win_streak, lose_streak)`` after the update.
        """
        if not input_data:
            self.win_streak, self.lose_streak = 0, self.lose_streak + 1
        else:
            self.win_streak, self.lose_streak = self.win_streak + 1, 0
        return self.win_streak, self.lose_streak

    def maximum_info(self, input_df, column_name):
        """Return ``(max of column rounded to 2 places, 'date' at that row)``."""
        values = input_df[column_name]
        date_at_peak = input_df.loc[values.idxmax(), 'date']
        return round(values.max(), 2), date_at_peak

    def minimum_info(self, input_df, column_name):
        """Return ``(min of column rounded to 2 places, 'date' at that row)``."""
        values = input_df[column_name]
        date_at_trough = input_df.loc[values.idxmin(), 'date']
        return round(values.min(), 2), date_at_trough

    def mean_info(self, input_df, column_name):
        """Return the mean of ``column_name`` rounded to 2 places."""
        average = input_df[column_name].mean()
        return round(average, 2)

    def median_info(self, input_df, column_name):
        """Return the median of ``column_name`` rounded to 2 places."""
        middle = input_df[column_name].median()
        return round(middle, 2)

    def max_streaks(self, input_df, column_name='win_streaks'):
        """Return ``(longest streak in column_name, 'date' at that row)``.

        Same computation as ``maximum_info``; only the default column differs.
        """
        return self.maximum_info(input_df, column_name)

    def get_stats(self, input_df, column_name='close'):
        """Compute min/max/mean/median of a column plus the longest streaks.

        Side effects: the running counters are reset first, and the columns
        ``win_streaks`` and ``lose_streaks`` are written into ``input_df``.

        Args:
            input_df: DataFrame with ``'date'``, ``'profit_flag'`` and
                ``column_name`` columns. ``profit_flag`` is read row by row in
                the frame's order; truthy values count as wins.
            column_name: column the value statistics are computed on.

        Returns:
            dict: keys ``MinimumValue``, ``MinimumValueDate``,
            ``MaximumValue``, ``MaximumValueDate``, ``MeanValue``,
            ``MedianValue``, ``MaxWinStreaks``, ``MaxWinStreaksDate``,
            ``MaxLoseStreaks``, ``MaxLoseStreaksDate``.
        """
        self.reset()
        # Each element becomes the (win, lose) counter pair after that row.
        streak_pairs = input_df["profit_flag"].apply(self.get_streaks)
        input_df["win_streaks"] = streak_pairs.apply(lambda pair: pair[0])
        input_df["lose_streaks"] = streak_pairs.apply(lambda pair: pair[1])

        lowest, lowest_date = self.minimum_info(input_df, column_name)
        highest, highest_date = self.maximum_info(input_df, column_name)
        average = self.mean_info(input_df, column_name)
        middle = self.median_info(input_df, column_name)
        best_run, best_run_date = self.max_streaks(input_df, "win_streaks")
        worst_run, worst_run_date = self.max_streaks(input_df, "lose_streaks")

        return {
            'MinimumValue': lowest,
            'MinimumValueDate': lowest_date,
            'MaximumValue': highest,
            'MaximumValueDate': highest_date,
            'MeanValue': average,
            'MedianValue': middle,
            'MaxWinStreaks': best_run,
            'MaxWinStreaksDate': best_run_date,
            'MaxLoseStreaks': worst_run,
            'MaxLoseStreaksDate': worst_run_date,
        }


In [86]:
# Per-year summary of the closing price: one column per calendar year,
# one row per statistic returned by StatsCalculation.get_stats.
statcalc = StatsCalculation()
yearly_dict = {}

# Calendar parts are (re)derived here so this cell only needs the 'date' column.
data_df['year'] = data_df['date'].dt.year
data_df['month'] = data_df['date'].dt.month
data_df['day'] = data_df['date'].dt.day
data_df['week'] = data_df['date'].dt.isocalendar().week

# Computed once, BEFORE any year slice is taken. Doing this inside the loop
# (after slicing) meant each slice silently relied on a 'day_profit' column
# left behind by an earlier cell, and repeated identical work every iteration.
# The diff spans year boundaries, so the first trading day of a year is
# compared with the last close of the previous year.
data_df['day_profit'] = data_df['close'].diff(1).fillna(0)

for this_year in data_df['year'].unique():
    # .copy() yields an independent frame: columns are added to it below and
    # again inside get_stats (win_streaks / lose_streaks), which must not be
    # done on a filtered view of data_df.
    this_year_df = data_df[data_df['year'] == this_year].copy()
    # 1 when the close rose versus the previous row, else 0.
    this_year_df["profit_flag"] = (this_year_df['day_profit'] > 0).astype(int)
    yearly_dict[this_year] = statcalc.get_stats(this_year_df, 'close')
pd.DataFrame(yearly_dict)

Unnamed: 0,2015,2016,2017,2018,2019,2020,2021,2022
MinimumValue,241.15,171.55,273.8,301.8,323.5,207.8,496.8,654.85
MinimumValueDate,2015-12-11 00:00:00+05:30,2016-05-19 00:00:00+05:30,2017-01-02 00:00:00+05:30,2018-10-05 00:00:00+05:30,2019-02-26 00:00:00+05:30,2020-03-23 00:00:00+05:30,2021-01-06 00:00:00+05:30,2022-02-24 00:00:00+05:30
MaximumValue,368.55,312.8,437.55,448.75,426.4,485.4,878.6,970.25
MaximumValueDate,2015-08-20 00:00:00+05:30,2016-10-25 00:00:00+05:30,2017-11-06 00:00:00+05:30,2018-01-24 00:00:00+05:30,2019-06-14 00:00:00+05:30,2020-12-30 00:00:00+05:30,2021-06-07 00:00:00+05:30,2022-09-20 00:00:00+05:30
MeanValue,314.57,238.7,360.06,376.93,380.33,349.88,709.48,772.74
MedianValue,319.14,233.6,366.58,377.25,378.1,350.15,725.82,752.85
MaxWinStreaks,7,5,6,7,5,7,6,7
MaxWinStreaksDate,2015-08-06 00:00:00+05:30,2016-07-25 00:00:00+05:30,2017-04-05 00:00:00+05:30,2018-11-26 00:00:00+05:30,2019-03-13 00:00:00+05:30,2020-08-17 00:00:00+05:30,2021-02-18 00:00:00+05:30,2022-04-06 00:00:00+05:30
MaxLoseStreaks,8,6,5,9,6,7,8,6
MaxLoseStreaksDate,2015-11-05 00:00:00+05:30,2016-01-11 00:00:00+05:30,2017-02-17 00:00:00+05:30,2018-05-24 00:00:00+05:30,2019-01-28 00:00:00+05:30,2020-03-02 00:00:00+05:30,2021-06-17 00:00:00+05:30,2022-01-24 00:00:00+05:30


# Closest Stocks

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Folder that is scanned for "*_day_data.csv" files. The default is the
# original author's local Windows path; set the STOCK_DATA_DIR environment
# variable to run the notebook against a different location.
stock_data_folder = os.environ.get(
    "STOCK_DATA_DIR",
    r"E:\PythonCodes\StockAnalysis\Data\IndianStocks_IntradayData",
)
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make everything derived from file_list reproducible across machines.
file_list = [
    os.path.join(stock_data_folder, file_name)
    for file_name in sorted(os.listdir(stock_data_folder))
    if file_name.endswith('_day_data.csv')
]

In [3]:
# Build a date-indexed frame with one column of close-to-close percentage
# changes per file in file_list; the column name is the file name up to its
# first underscore.
stock_variations = []
for stock_file in file_list:
    data_df = pd.read_csv(stock_file)
    # The first CSV column is dropped, exactly as in the original cell.
    data_df = data_df.drop(columns=data_df.columns[0])
    data_df['date'] = pd.to_datetime(data_df['date'])
    data_df = data_df.set_index('date')
    # The first row has no previous close, so its change is set to 0.
    stock_variation = data_df['close'].pct_change().fillna(0)
    stock_variation.name = os.path.basename(stock_file).split('_')[0]
    stock_variations.append(stock_variation)

# Concatenate once instead of growing the frame inside the loop: repeated
# concat re-copies all accumulated data on every iteration, and seeding with
# an empty DataFrame is unnecessary. Dates missing for a stock become 0 after
# the outer alignment.
all_stock_df = (
    pd.concat(stock_variations, axis=1).fillna(0)
    if stock_variations
    else pd.DataFrame()  # no matching files: keep the original empty result
)
all_stock_df.head()

Unnamed: 0,ACC,ADANIENT,ADANIGREEN,ADANIPORTS,AMBUJACEM,APOLLOHOSP,ASIANPAINT,AUROPHARMA,AXISBANK,BAJAJ-AUTO,...,TATASTEEL,TCS,TECHM,TITAN,TORNTPHARM,ULTRACEMCO,UPL,VEDL,WIPRO,YESBANK
2015-01-01 00:00:00+05:30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-02 00:00:00+05:30,0.015924,0.004898,0.0,-0.000626,0.01405,0.001723,0.034573,0.004791,0.022344,-0.000163,...,0.014833,0.013309,0.004887,0.007899,0.04103,0.029897,0.031175,0.013911,0.00886,0.025435
2015-01-05 00:00:00+05:30,0.005435,0.016552,0.0,0.013935,-0.001299,0.003325,-6.5e-05,-0.000477,0.006319,0.003646,...,0.014616,-0.015189,-0.016677,0.008098,-0.004444,0.002042,-0.018735,-0.011246,0.001814,0.002146
2015-01-06 00:00:00+05:30,-0.034946,-0.007592,0.0,-0.006022,-0.036852,-0.029772,-0.023871,-0.039012,-0.035727,-0.007799,...,-0.048019,-0.03687,-0.010734,-0.034206,-0.045991,-0.02806,-0.048384,-0.048908,-0.023393,-0.028026
2015-01-07 00:00:00+05:30,-0.011637,-0.008858,0.0,-0.00233,-0.009228,0.007833,0.020093,0.015867,-0.000802,0.002331,...,-0.018916,-0.011812,-0.004668,0.005098,-0.005386,-0.00395,-0.018012,-0.000239,-0.00761,-0.018532


In [None]:
def cosine_similary(first, second):
    """Return the cosine distance, ``1 - cosine similarity``, of two vectors.

    The original stub ended at ``return 1 - `` (a SyntaxError); this completes
    it as one minus the cosine similarity of the two inputs. Despite the
    function's name, the result is therefore a distance: 0 for vectors
    pointing the same way, 1 for orthogonal vectors, 2 for opposite vectors.

    Args:
        first: 1-D sequence of numbers (list, numpy array, pandas Series).
        second: 1-D sequence of numbers of the same length as ``first``.

    Returns:
        float: the cosine distance. If either vector has zero norm the
        similarity is taken as 0, so the distance returned is 1.0.
    """
    first = np.asarray(first, dtype=float)
    second = np.asarray(second, dtype=float)
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    if norm_product == 0:
        # An all-zero vector has no direction; avoid dividing by zero.
        return 1.0
    return 1 - float(np.dot(first, second) / norm_product)

In [None]:
# FIXME(review): unfinished cell -- the lambda has no body, so this line is a
# SyntaxError as written. Presumably each column of all_stock_df was meant to
# be scored with a similarity/distance function; confirm the intent and
# complete or remove this cell.
all_stock_df.apply(lambda x: )