In [7]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import confusion_matrix

In [8]:
# Load the aggregated company series and parse `sunday_beginning` as datetimes.
df = pd.read_csv('aggregated_companies_ts.csv')
df['sunday_beginning'] = pd.to_datetime(df['sunday_beginning'])

# Load the date-matching table and parse the column used as the right join key.
dates_match = pd.read_csv('date_match.csv')
dates_match['previous_sunday'] = pd.to_datetime(dates_match['previous_sunday'])

# Left join: every row of df is kept; rows with no (conm, date) match receive
# NaN in the columns contributed by dates_match.
df = pd.merge(
    df,
    dates_match,
    how='left',
    left_on=['conm', 'sunday_beginning'],
    right_on=['conm', 'previous_sunday'],
)
df = df.drop('Qtly_Announcement', axis=1)

# Write the merged frame to disk.
df.to_csv('test.csv', index=False)

IOError: File aggregated_companies_ts.csv does not exist

In [None]:
# Binary flags: `weekly_pct` strictly above 0.04 / strictly below -0.04.
# The two conditions cannot both hold for the same row.
df['positive_surprise'] = df['weekly_pct'].gt(0.04).astype(int)
df['negative_surprise'] = df['weekly_pct'].lt(-0.04).astype(int)

# Three-class label: 1 = positive flag set, -1 = negative flag set, 0 = neither.
df['surprises'] = np.select(
    [df['positive_surprise'] == 1, df['negative_surprise'] == 1],
    [1, -1],
    default=0,
)

In [None]:
# Keep only the rows flagged as announcements. The explicit copy makes the
# result an independent frame, so the column assignments and in-place edits
# applied to `df` afterwards do not operate on a slice of the unfiltered frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['announcement'] == 1].copy()
df.to_csv('test.csv', index=False)

In [None]:
# Make sure the date column is datetime-typed so the sort is chronological.
df['sunday_beginning'] = pd.to_datetime(df['sunday_beginning'], format='%d/%m/%y')

# Sort by date and replace every missing value with 0. Reassigning (instead of
# inplace=True) keeps the cell safe to re-run and avoids mutating a filtered
# frame in place. The former mid-cell `df.head()` was dropped: only the last
# expression of a cell is displayed, so it had no effect.
df = df.sort_values(by='sunday_beginning').fillna(0)

# Chronological 70/30 split: the earliest 70% of rows train, the rest test.
split = int(0.7 * df.shape[0])

# Independent copies, so later index resets on df_test never touch `df`.
df_train = df.iloc[:split, :].copy()
df_test = df.iloc[split:, :].copy()

#### Positive Surprises

In [None]:
# The six predictor columns shared by the train and test feature matrices.
feature_cols = ['six', 'five', 'four', 'three', 'two', 'one']

# Target: the binary positive-surprise flag.
x, y = df_train[feature_cols], df_train['positive_surprise']
x_test, y_test = df_test[feature_cols], df_test['positive_surprise']

In [None]:
# Dimensions of the training feature matrix as (rows, columns).
x.shape

In [None]:
# class_weight='balanced' reweights classes inversely to their frequency in y.
# fit() returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression(class_weight='balanced').fit(x, y)
lr

In [None]:
# predict_proba returns one column per class; keep column 1 (the probability
# of class 1, i.e. a positive surprise) as a single-column DataFrame whose
# column label is the integer 1.
preds = pd.DataFrame(lr.predict_proba(x_test))[[1]]
preds

In [None]:
# Renumber the test rows 0..n-1 so they line up with the prediction frame,
# which carries a default integer index.
df_test = pd.DataFrame(df_test).reset_index(drop=True)
df_test

In [None]:
# Index-aligned join of the test rows with their predicted probability, with
# the probability column (label 1) renamed to 'preds' before export.
results = df_test.join(preds).rename(columns={1: 'preds'})
results.to_csv('results_surprises_positives.csv', index=False)

#### Negative Surprises

In [None]:
# Same six predictor columns; the target is now the negative-surprise flag.
feature_cols = ['six', 'five', 'four', 'three', 'two', 'one']

x = df_train[feature_cols]
y = df_train['negative_surprise']

x_test = df_test[feature_cols]
y_test = df_test['negative_surprise']

In [None]:
# Refit a fresh balanced logistic regression on the negative-surprise target.
# `lr` is rebound: the positive-surprise model is no longer reachable here.
lr = LogisticRegression(class_weight='balanced').fit(x, y)
lr

In [None]:
# Keep only column 1 of predict_proba (probability of class 1, i.e. a negative
# surprise) as a single-column DataFrame whose column label is the integer 1.
class_probs = pd.DataFrame(lr.predict_proba(x_test))
preds = class_probs[[1]]
preds

In [None]:
# Ensure the test rows are indexed 0..n-1 so the join with `preds` aligns.
df_test = pd.DataFrame(df_test).reset_index(drop=True)
df_test

In [None]:
# Attach the negative-surprise probability to each test row and export it.
results = df_test.join(preds).rename(columns={1: 'preds'})
results.to_csv('results_surprises_negative.csv', index=False)

#### Multinomial

In [None]:
# Same six predictor columns; the target is the three-class 'surprises' label.
feature_cols = ['six', 'five', 'four', 'three', 'two', 'one']

x, y = df_train[feature_cols], df_train['surprises']
x_test, y_test = df_test[feature_cols], df_test['surprises']

In [None]:
# One multinomial (softmax) model over the 'surprises' classes with balanced
# class weights. Omit class_weight to get the unweighted variant.
lr = LogisticRegression(
    class_weight='balanced',
    multi_class='multinomial',
    solver='newton-cg',
).fit(x, y)
lr

In [None]:
# One probability column per class, in lr.classes_ order; the columns are
# labelled by position (0, 1, 2, ...).
preds = pd.DataFrame(lr.predict_proba(x_test))
preds

In [None]:
# Ensure the test rows are indexed 0..n-1 so the join with `preds` aligns.
df_test = pd.DataFrame(df_test).reset_index(drop=True)
df_test

In [None]:
# Positional columns 0/1/2 follow the sorted class labels -1/0/1, hence the
# neg/neutral/pos naming (this assumes all three classes occur in training).
results = df_test.join(preds).rename(
    columns={0: 'neg_prob', 1: 'neutral_prob', 2: 'pos_prob'}
)
results.to_csv('results_surprises_multinomial.csv', index=False)

In [None]:
# Turn the class probabilities into hard predictions, vectorised over the
# whole frame. This replaces a per-row Python loop that copied one Series per
# row, stored 0-d arrays in it, and rebuilt an all-object DataFrame.
rows = results.copy()

# A class is predicted only when its probability is strictly greater than both
# others, so the two masks can never be True on the same row.
pos_wins = rows['pos_prob'] > rows[['neutral_prob', 'neg_prob']].max(axis=1)
neg_wins = rows['neg_prob'] > rows[['neutral_prob', 'pos_prob']].max(axis=1)

rows['pos_pred'] = pos_wins.astype(int)
rows['neg_pred'] = neg_wins.astype(int)

# 1 = positive, -1 = negative, 0 = neutral (including ties).
rows['surprise_prediction'] = np.select([pos_wins, neg_wins], [1, -1], default=0)
rows

In [None]:
# Both label columns must be plain integers for the comparison below.
rows['surprises'] = rows['surprises'].astype(int)
rows['surprise_prediction'] = rows['surprise_prediction'].astype(int)

# Rows are true labels, columns are predictions. `labels` pins the order to
# -1, 0, 1 and keeps the matrix 3x3 even if a class is absent from the test set.
# print(...) with one argument works on both Python 2 and Python 3.
print(confusion_matrix(rows['surprises'], rows['surprise_prediction'], labels=[-1, 0, 1]))

#### One-vs-Rest (OVA) Models

In [None]:
# One balanced binary logistic regression per class, combined one-vs-rest.
ova = OneVsRestClassifier(LogisticRegression(class_weight='balanced'))

In [None]:
# `x` and `y` are not redefined in this section: they still hold the training
# features and the three-class 'surprises' target set up for the multinomial
# model, so that cell must have been run first.
ova.fit(x,y)

In [None]:
# Positional columns 0/1/2 follow the sorted class labels -1/0/1 (this assumes
# all three classes occur in training).
preds = pd.DataFrame(ova.predict_proba(x_test)).rename(
    columns={0: 'negative', 1: 'neutral', 2: 'positive'}
)
preds

In [None]:
# Ensure the test rows are indexed 0..n-1 so the join with `preds` aligns.
df_test = df_test.reset_index(drop=True)
df_test

In [None]:
# Attach the OVA class probabilities to the test rows (index-aligned; 'left'
# is join's default) and export the combined frame.
df_test1 = df_test.join(preds, how='left')
df_test1.to_csv('surprises_ova.csv', index=False)

In [None]:
# Display the test rows together with their OVA class probabilities.
df_test1

In [None]:
# Turn the OVA class probabilities into hard predictions, vectorised over the
# whole frame. This replaces a per-row Python loop that copied one Series per
# row, stored 0-d arrays in it, and rebuilt an all-object DataFrame.
rows = df_test1.copy()

# A class is predicted only when its probability is strictly greater than both
# others, so the two masks can never be True on the same row.
pos_wins = rows['positive'] > rows[['neutral', 'negative']].max(axis=1)
neg_wins = rows['negative'] > rows[['neutral', 'positive']].max(axis=1)

rows['pos_pred'] = pos_wins.astype(int)
rows['neg_pred'] = neg_wins.astype(int)

# 1 = positive, -1 = negative, 0 = neutral (including ties).
rows['surprise_prediction'] = np.select([pos_wins, neg_wins], [1, -1], default=0)
rows

In [None]:
# Both label columns must be plain integers for the comparison below.
rows['surprises'] = rows['surprises'].astype(int)
rows['surprise_prediction'] = rows['surprise_prediction'].astype(int)

# Writes to the same path as the earlier probabilities-only export, replacing
# it with the version that also carries the hard predictions.
rows.to_csv('surprises_ova.csv', index=False)

# Rows are true labels, columns are predictions. `labels` pins the order to
# -1, 0, 1 and keeps the matrix 3x3 even if a class is absent from the test set.
# print(...) with one argument works on both Python 2 and Python 3.
print(confusion_matrix(rows['surprises'], rows['surprise_prediction'], labels=[-1, 0, 1]))