In [263]:
# Importing libraries to be used
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [264]:
# Load the rumor dataset from disk.
# pd.read_csv already returns a pandas DataFrame, so no extra conversion is needed.
data = pd.read_csv('Data.csv')

In [181]:
# Correlation of every numeric column with CREDIBILITY, strongest positive first.
# Only numeric columns are handed to corr(): the frame still contains the text column
# RumorDate, and recent pandas versions raise on non-numeric data instead of skipping it.
corr_matrix = data.select_dtypes(include='number').corr()
print(corr_matrix["CREDIBILITY"].sort_values(ascending=False))

CREDIBILITY                        1.000000
MARKETCAP                          0.031100
[0,1]                              0.016830
EPS                                0.016124
target_price_move_10               0.015324
[-1,0]                             0.015063
[4,5]                              0.013555
CASH                               0.013043
WEEKHIGH52                         0.011016
[5,6]                              0.007690
[3,4]                              0.001413
NETMARGINS                        -0.001792
diff_target_SP500_price_move_10   -0.005156
PE                                -0.005832
WEEKLOW52                         -0.007362
[1,2]                             -0.009953
LASTTRADE                         -0.011580
AVERAGEVOLUME                     -0.012109
RELIABILITY                       -0.013433
target_price_move_3               -0.014642
Unnamed: 0                        -0.015728
[2,3]                             -0.016685
[-2,-1]                         

In [21]:
# Correlation of every numeric column with RELIABILITY, strongest positive first.
# Only numeric columns are handed to corr(): the frame still contains the text column
# RumorDate, and recent pandas versions raise on non-numeric data instead of skipping it.
corr_matrix = data.select_dtypes(include='number').corr()
print(corr_matrix["RELIABILITY"].sort_values(ascending=False))

RELIABILITY                        1.000000
GROSSMARGINS                       0.025248
EPS                                0.018333
[3,4]                              0.012775
LASTTRADE                          0.009898
WEEKHIGH52                         0.009410
AVERAGEVOLUME                      0.008485
[2,3]                              0.007447
[5,6]                              0.007163
PE                                 0.006358
[0,1]                              0.005689
WEEKLOW52                          0.005310
[4,5]                              0.003026
diff_target_SP500_price_move_3     0.001766
Unnamed: 0                        -0.000301
diff_target_SP500_price_move_10   -0.002219
target_price_move_5               -0.003250
target_price_move_3               -0.008775
[-1,0]                            -0.009314
NETMARGINS                        -0.012330
diff_target_SP500_price_move_5    -0.013389
CREDIBILITY                       -0.013433
MARKETCAP                       

In [265]:
# CREDIBILITY and RELIABILITY are the main indicators of whether a rumor is true,
# so both are folded into one weighted-average column and then removed.
# RELIABILITY gets the slightly larger weight (5 vs 4): a reliable source is expected
# to be credible, whereas a credible source is not always reliable.
weighted_reliability = (4 * data['CREDIBILITY'] + 5 * data['RELIABILITY']) / 9
data = (
    data
    .drop(columns=['CREDIBILITY', 'RELIABILITY'])
    .assign(avg_reliability=weighted_reliability)
)

In [16]:
# Correlation of every numeric column with avg_reliability, strongest positive first.
# Only numeric columns are handed to corr(): the frame still contains the text column
# RumorDate, and recent pandas versions raise on non-numeric data instead of skipping it.
corr_matrix = data.select_dtypes(include='number').corr()
print(corr_matrix["avg_reliability"].sort_values(ascending=False))

avg_reliability                    1.000000
EPS                                0.024344
[0,1]                              0.016574
WEEKHIGH52                         0.014585
[4,5]                              0.012326
MARKETCAP                          0.011860
[5,6]                              0.010572
[3,4]                              0.009472
[-1,0]                             0.005368
PE                                -0.000270
WEEKLOW52                         -0.002126
LASTTRADE                         -0.002329
CASH                              -0.002353
AVERAGEVOLUME                     -0.003660
diff_target_SP500_price_move_10   -0.005390
[2,3]                             -0.007832
NETMARGINS                        -0.009468
Unnamed: 0                        -0.012194
diff_target_SP500_price_move_3    -0.014522
GROSSMARGINS                      -0.014860
target_price_move_3               -0.016933
target_price_move_10              -0.021081
[1,2]                           

In [266]:
# Feature extraction and dimensionality reduction, guided by the correlations above
# and by observed trends: less useful columns are discarded and groups of similar
# columns are each collapsed into a single (weighted) average.

# Columns that are discarded outright.
discarded_columns = [
    '[6,7]', '[7,8]', '[8,9]', '[9,10]',
    'diff_target_SP500_price_move_10',
    'diff_target_SP500_price_move_5',
    'diff_target_SP500_price_move_3',
]
data = data.drop(columns=discarded_columns)

# Weighted mean of the three target price moves (weights 2 : 3 : 1).
price_move_columns = ['target_price_move_3', 'target_price_move_5', 'target_price_move_10']
move_3, move_5, move_10 = (data[name] for name in price_move_columns)
data['avg_target_price_move'] = (2 * move_3 + 3 * move_5 + move_10) / 6
data = data.drop(columns=price_move_columns)

# '[-2,0]' is a plain copy of '[-1,0]': '[-2,-1]' holds almost only zeros, so it is ignored.
data['[-2,0]'] = data['[-1,0]']
data = data.drop(columns=['Unnamed: 0', '[-2,-1]', '[-1,0]'])

# '[0,3]' is the unweighted mean of its three component columns.
first, second, third = data['[0,1]'], data['[1,2]'], data['[2,3]']
data['[0,3]'] = (first + second + third) / 3
data = data.drop(columns=['[0,1]', '[1,2]', '[2,3]'])

# '[3,6]' averages only '[3,4]' and '[4,5]': '[5,6]' holds almost only zeros, so it is ignored.
fourth, fifth = data['[3,4]'], data['[4,5]']
data['[3,6]'] = (fourth + fifth) / 2
data = data.drop(columns=['[3,4]', '[4,5]', '[5,6]'])

In [267]:
# Shape of the dataframe as (number of rows, number of columns)
data.shape

(3833, 16)

In [268]:
# Names of all columns currently in the dataframe, as a plain Python list
cols = data.columns.tolist()
cols

['RumorDate',
 'MARKETCAP',
 'LASTTRADE',
 'WEEKHIGH52',
 'WEEKLOW52',
 'CASH',
 'EPS',
 'PE',
 'GROSSMARGINS',
 'NETMARGINS',
 'AVERAGEVOLUME',
 'avg_reliability',
 'avg_target_price_move',
 '[-2,0]',
 '[0,3]',
 '[3,6]']

In [269]:
# The month of each rumor is derived from its date; the date itself is not kept.
rumor_month = pd.DatetimeIndex(data['RumorDate']).month

# Columns that are removed at this stage (the date plus four price/margin features).
removed_columns = ['RumorDate', 'LASTTRADE', 'GROSSMARGINS', 'WEEKHIGH52', 'WEEKLOW52']

# Final layout: month first, the ten retained features next, avg_reliability last.
final_columns = [
    'RumorMonth',
    'MARKETCAP', 'CASH', 'EPS', 'PE', 'NETMARGINS', 'AVERAGEVOLUME',
    'avg_target_price_move', '[-2,0]', '[0,3]', '[3,6]',
    'avg_reliability',
]

data = (
    data
    .drop(columns=removed_columns)
    .assign(RumorMonth=rumor_month)
    [final_columns]
)
data.head()

Unnamed: 0,RumorMonth,MARKETCAP,CASH,EPS,PE,NETMARGINS,AVERAGEVOLUME,avg_target_price_move,"[-2,0]","[0,3]","[3,6]",avg_reliability
0,6,0.001867,0.006359,0.200737,0.00401,0.503122,0.002178,0.091323,0.0,0.169819,0.245271,0.26286
1,10,0.000436,0.015291,0.200244,0.003134,0.101977,0.005157,0.11496,0.0,0.169819,0.245271,0.750318
2,3,8.2e-05,0.003331,0.200207,0.001419,0.0,0.037541,0.129356,0.0,0.248192,0.127712,0.24186
3,3,0.003343,0.00757,0.199467,0.001077,0.477627,0.007154,0.117686,0.0,0.248192,0.127712,0.255077
4,12,0.01339,0.015291,0.197011,0.002795,0.543184,0.007078,0.134985,0.0,0.247405,0.128892,0.161592


In [270]:
# Data type of every column in the dataframe
data.dtypes

RumorMonth                 int64
MARKETCAP                float64
CASH                     float64
EPS                      float64
PE                       float64
NETMARGINS               float64
AVERAGEVOLUME            float64
avg_target_price_move    float64
[-2,0]                   float64
[0,3]                    float64
[3,6]                    float64
avg_reliability          float64
dtype: object

In [277]:
# Importing regression models from sklearn library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

# Training of algorithms (Linear/Lasso Regression)
# One model is trained per calendar month. Its predictions on that month's rumors are
# ranked, and the score of the 100th best rumor becomes the month's "mark":
# a rumor whose score is at least the mark of its month lies in the Top 100.

# Size of the "top" list used to derive each month's mark
TOP_N = 100

# mark[m-1]: Top-100 threshold of month m (None when the month has no rumors)
mark = [None]*12

# reg[m-1]: fitted model of month m (None when the month has no rumors)
reg = [None]*12

months = list(range(1, 13))

# iterating over each month to get the mark and the model of that month
for i in months:

    # Filter into a separate frame. Reassigning `data` here would leave only the
    # first month's rows, so every later month would be trained on an empty frame.
    month_data = data[data['RumorMonth'] == i]

    # A model cannot be fitted without samples; keep the None placeholders instead.
    if month_data.empty:
        continue

    # Columns 1..10 are the features, column 11 (avg_reliability) is the target;
    # column 0 is RumorMonth itself.
    X = month_data.iloc[:, 1:11]
    Y = month_data.iloc[:, 11]

    # Training model (Linear Regression)
    # Lasso alternative: lassoReg = Lasso(alpha=0.01, normalize=True); reg[i-1] = lassoReg.fit(X, Y)
    lin_reg = LinearRegression()
    reg[i-1] = lin_reg.fit(X, Y)

    # Predicted scores of this month's rumors, highest first
    Y_pred = np.sort(lin_reg.predict(X))[::-1]

    # The mark is the score of the TOP_N-th best rumor (index TOP_N - 1), or of the
    # lowest-scored rumor when the month has fewer than TOP_N rumors.
    mark[i-1] = Y_pred[min(len(Y_pred), TOP_N) - 1]
                

In [272]:
# Function to predict rumors of upcoming months whether they will lie in top100 rumors of that month or not
def test_prediction(data):
    
    data = pd.DataFrame(data)
    data['avg_reliability'] = (4*data['CREDIBILITY']+5*data['RELIABILITY'])/9
    data = data.drop(['CREDIBILITY','RELIABILITY'],axis=1)
    
    data = data.drop(['[6,7]','[7,8]','[8,9]','[9,10]','diff_target_SP500_price_move_10','diff_target_SP500_price_move_5','diff_target_SP500_price_move_3'],axis=1)    
    data['avg_target_price_move'] = (2*data['target_price_move_3']+3*data['target_price_move_5']+data['target_price_move_10'])/6
    data = data.drop(['target_price_move_3','target_price_move_5','target_price_move_10'],axis=1)

    data['[-2,0]'] = (data['[-2,-1]']+data['[-1,0]'])/2
    data = data.drop(['Unnamed: 0','[-2,-1]','[-1,0]'],axis=1)

    data['[0,3]'] = (data['[0,1]']+data['[1,2]']+data['[2,3]'])/3
    data = data.drop(['[0,1]','[1,2]','[2,3]'],axis=1)

    data['[3,6]'] = (data['[3,4]']+data['[4,5]']+data['[5,6]'])/3
    data = data.drop(['[3,4]','[4,5]','[5,6]'],axis=1)
    
    data = data[['RumorDate','MARKETCAP','LASTTRADE','WEEKHIGH52','WEEKLOW52','CASH','EPS','PE','GROSSMARGINS','NETMARGINS','AVERAGEVOLUME','avg_target_price_move','[-2,0]','[0,3]','[3,6]','avg_reliability']]
    data = data.drop(['LASTTRADE','GROSSMARGINS','PE','WEEKLOW52'],axis=1)

    data['RumorMonth'] = pd.DatetimeIndex(data['RumorDate']).month
    data = data.drop(['RumorDate'],axis=1)

    data = data[['RumorMonth','MARKETCAP','WEEKHIGH52','CASH','EPS','NETMARGINS','AVERAGEVOLUME','avg_target_price_move','[-2,0]','[0,3]','[3,6]','avg_reliability']]
    
    
    months = [1,2,3,4,5,6,7,8,9,10,11,12]
    
    for i in months:
        data = data[data['RumorMonth'] == i]
        x = data.iloc[:,1:11]
        y_pred = reg[i-1].predict(x) # Using trained model coefficients of ith month to predict
        if(y_pred >= mark[i-1]):     # Comparing the prediction with the mark
            print('The Rumor will be in Top 100 rumors.')
        else:
            print('The Rumor will not be in Top 100 rumors.')
    