# Dependencies

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
# Lift pandas' display truncation so DataFrames render every row and column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Creating/Cleaning Training Data

In [44]:
training_df = pd.read_csv('training_data.csv', index_col=0)

# Analyst ratings that count as a "buy" label (compared case-insensitively).
buy_classification = ['buy', 'outperform', 'positive']

# Read the analyst ratings once, up front. The file does not change between
# training rows, so re-reading it inside a per-row loop only repeats disk I/O.
analyst_suggestions = pd.read_csv('../Data_Collection/Scrapers/Final_Analyst_Rating.csv', index_col=0)

# Build a lower-cased ticker -> 0/1 lookup. Rating rows missing a ticker or a
# rating cannot produce a label and are skipped. When a ticker appears more
# than once, the later row overwrites the earlier one in the dict, which keeps
# the "last matching row wins" result of scanning the file top to bottom.
rated = analyst_suggestions.dropna(subset=['Ticker', 'Rating'])
buy_by_ticker = dict(zip(
    rated['Ticker'].str.lower(),
    rated['Rating'].str.lower().isin(buy_classification).astype(int),
))

# Label every training row in one pass: 1 = buy-type rating, 0 = anything else,
# NaN = the ticker has no analyst rating at all.
training_df['Buy'] = training_df['Ticker'].str.lower().map(buy_by_ticker)

# Rows without a rating have no label to train on. Leaving a placeholder in
# 'Buy' would make the later integer conversion of the label column fail, so
# drop them here and say so.
unlabeled = training_df['Buy'].isna()
if unlabeled.any():
    print(f"Dropping {int(unlabeled.sum())} row(s) with no analyst rating: "
          f"{training_df.loc[unlabeled, 'Ticker'].tolist()}")
training_df = training_df.loc[~unlabeled].copy()
training_df['Buy'] = training_df['Buy'].astype(int)

# Scaling the data
# Min-max scale every feature column; the label and the ticker are left as-is.
Scaler = MinMaxScaler()
cols = [x for x in training_df.columns if x not in ['Buy', 'Ticker']]
training_df[cols] = Scaler.fit_transform(training_df[cols])
# Drop any column that still contains a missing value.
training_df = training_df.dropna(axis='columns')

training_df

Unnamed: 0,Ticker,Conversations,Headlines,regularMarketVolume,profitMargins,sharesShort,sharesPercentSharesOut,heldPercentInstitutions,heldPercentInsiders,mostRecentQuarter,nextFiscalYearEnd,shortRatio,enterpriseValue,sharesShortPriorMonth,Buy
0,KHC,0.755495,0.346059,0.05607,0.694382,0.173293,0.102862,0.689797,0.027648,0.716667,0.882353,0.320091,0.15538,0.239418,0
1,BIIB,0.692308,0.823892,0.046038,0.833823,0.01992,0.12529,0.767212,0.010343,0.75,0.898693,0.273553,0.143176,0.030156,0
2,JPM,0.346154,0.752463,0.156777,0.899838,0.135292,0.00464,0.605627,0.014483,0.75,0.898693,0.053348,0.0,0.184657,0
3,GRPN,0.546703,0.424877,0.004635,0.55161,0.016036,0.692189,0.505289,0.367437,0.0,0.898693,0.205448,0.124995,0.019889,0
4,OXY,0.681319,0.325123,0.123771,0.149787,0.174616,0.149265,0.530112,0.003549,0.0,0.898693,0.079455,0.152936,0.238445,1
5,NVDA,0.601648,0.564039,0.118436,0.84713,0.068236,0.075793,0.552491,0.068377,0.258333,1.0,0.065834,0.265575,0.05557,1
6,GPRO,0.728022,0.508621,0.041149,0.63353,0.105761,0.67208,0.555125,0.01979,0.0,0.898693,0.175936,0.125176,0.098048,0
7,GRUB,0.263736,0.0,0.112177,0.618404,0.073125,0.789637,0.991222,0.002028,0.75,0.898693,0.748014,0.127078,0.081507,0
8,HD,0.516484,0.54064,0.029226,0.743477,0.067579,0.026295,0.591749,0.000676,0.258333,1.0,0.200908,0.275071,0.085851,1
9,MSFT,0.766484,0.37931,0.218991,0.90481,0.474708,0.021655,0.597527,0.000862,0.75,0.297386,0.174801,0.854646,0.443091,0


# Training the model

In [54]:
# Features are every column except the label ('Buy') and the ticker identifier.
training_cols = [col for col in training_df.columns if col not in ['Buy', 'Ticker']]

X_train = training_df[training_cols]
y_train = training_df['Buy'].astype('int')

# fit() returns the estimator itself, so construction and fitting can be chained.
LR = LogisticRegression().fit(X_train, y_train)
y_predictions = LR.predict(X_train)

# NOTE: predictions are made on the same rows the model was fitted on, so the
# report below describes in-sample (training) performance.
print(classification_report(y_train, y_predictions))

              precision    recall  f1-score   support

           0       0.68      0.90      0.77        51
           1       0.74      0.39      0.51        36

    accuracy                           0.69        87
   macro avg       0.71      0.65      0.64        87
weighted avg       0.70      0.69      0.66        87

