# Installing the necessary library

In [1]:
!pip install --user sklearn



# Importing the necessary modules and libraries 

In [2]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Creating and processing the DataFrame

In [3]:
#creating data frame
path = r"C:\Users\velin\OneDrive\Desktop\dss\assignemnt 3\Final dataset.xlsx"# insert the path here.
df = pd.read_excel(path)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adjusted Close,Volume,ts_polarity,twitter_volume
0,2016-01-04,25.6525,26.342501,25.5,26.3375,24.251434,270597600,0.070389,1133
1,2016-01-05,26.4375,26.4625,25.602501,25.6775,23.643711,223164000,0.133635,1430
2,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.181013,273829600,0.072042,1949
3,2016-01-07,24.67,25.032499,24.1075,24.112499,22.202665,324377600,0.074369,2289
4,2016-01-08,24.637501,24.7775,24.190001,24.24,22.32007,283192000,0.051595,2235


In [4]:
#adjusting data frame to just the data we need
apple_df = df[['Date', 'Adjusted Close', 'Volume', 'ts_polarity', 'twitter_volume']]
apple_df.set_index("Date", inplace = True)
apple_df.head()

Unnamed: 0_level_0,Adjusted Close,Volume,ts_polarity,twitter_volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-04,24.251434,270597600,0.070389,1133
2016-01-05,23.643711,223164000,0.133635,1430
2016-01-06,23.181013,273829600,0.072042,1949
2016-01-07,22.202665,324377600,0.074369,2289
2016-01-08,22.32007,283192000,0.051595,2235


In [5]:
#sorting TS_polarity into positive and negative 
sentiment = []
for ts in apple_df["ts_polarity"]:
    if ts >= 0.05:
        sentiment.append("positive")
    elif ts <= - 0.05:
        sentiment.append("negative")
    else:
        sentiment.append("neutral")
apple_df["sentiment"] = sentiment
apple_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_df["sentiment"] = sentiment


Unnamed: 0_level_0,Adjusted Close,Volume,ts_polarity,twitter_volume,sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-04,24.251434,270597600,0.070389,1133,positive
2016-01-05,23.643711,223164000,0.133635,1430,positive
2016-01-06,23.181013,273829600,0.072042,1949,positive
2016-01-07,22.202665,324377600,0.074369,2289,positive
2016-01-08,22.32007,283192000,0.051595,2235,positive


In [6]:
# Determining the stock trend based on diff in prices
apple_df["price diff"] = apple_df["Adjusted Close"].diff()
apple_df.dropna(inplace = True)
apple_df["trend"] = np.where(apple_df["price diff"] > 0, 1, 0)

apple_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_df["price diff"] = apple_df["Adjusted Close"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_df.dropna(inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_df["trend"] = np.where(apple_df["price diff"] > 0, 1, 0)


Unnamed: 0_level_0,Adjusted Close,Volume,ts_polarity,twitter_volume,sentiment,price diff,trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-05,23.643711,223164000,0.133635,1430,positive,-0.607723,0
2016-01-06,23.181013,273829600,0.072042,1949,positive,-0.462698,0
2016-01-07,22.202665,324377600,0.074369,2289,positive,-0.978348,0
2016-01-08,22.32007,283192000,0.051595,2235,positive,0.117405,1
2016-01-11,22.68148,198957600,0.019443,1222,neutral,0.36141,1


In [7]:
#Creating dummy variables for Neg, pos or neutral sentiment
new_apple_df = apple_df[["Adjusted Close", "Volume", "twitter_volume", "sentiment", "trend"]]
new_apple_df = pd.get_dummies(new_apple_df, columns = ["sentiment"])
new_apple_df.head()

Unnamed: 0_level_0,Adjusted Close,Volume,twitter_volume,trend,sentiment_negative,sentiment_neutral,sentiment_positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-05,23.643711,223164000,1430,0,0,0,1
2016-01-06,23.181013,273829600,1949,0,0,0,1
2016-01-07,22.202665,324377600,2289,0,0,0,1
2016-01-08,22.32007,283192000,2235,1,0,0,1
2016-01-11,22.68148,198957600,1222,1,0,1,0


# Creating the feature set 

In [8]:
#Defining feature set for the model
x = new_apple_df.copy()
x.drop("trend", axis = 1, inplace = True)
x.head()

Unnamed: 0_level_0,Adjusted Close,Volume,twitter_volume,sentiment_negative,sentiment_neutral,sentiment_positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-05,23.643711,223164000,1430,0,0,1
2016-01-06,23.181013,273829600,1949,0,0,1
2016-01-07,22.202665,324377600,2289,0,0,1
2016-01-08,22.32007,283192000,2235,0,0,1
2016-01-11,22.68148,198957600,1222,0,1,0


# Creating the dependent variable

In [9]:
#defining labels using trend
y = new_apple_df["trend"].values.reshape(-1, 1)
y[:5]

array([[0],
       [0],
       [0],
       [1],
       [1]])

# Splitting the data into train and test data and building the model

In [10]:
#split the data into training and test set for machine learning
split = int(0.75 * len(x))
x_train = x[ : split]
x_test = x[split : ]

y_train = y[ : split]
y_test = y[split : ]

In [11]:
#using standard scaler to normalize the feature set 
scaler = StandardScaler() #call standard scaler func
x_scaler = scaler.fit(x_train) #create scaler for x
x_train_norm = x_scaler.transform(x_train) #apply scaler to x_train
x_test_norm = x_scaler.transform(x_test) #apply scaler to x_test

In [12]:
#Create the Classifier model
rf_model = RandomForestClassifier(n_estimators=20)
rf_model = rf_model.fit(x_train_norm, y_train.ravel())

# Obtaining the prediction score, confusion matrix and classification report

In [13]:
#Obtaining accuracy metric
predictions = rf_model.predict(x_test_norm)
acc = accuracy_score(y_test, predictions)
print(f"Acc : {acc}")

Acc : 0.49783549783549785


In [14]:
#Creating a confusion matrix
matrix = confusion_matrix(y_test, predictions)
matrix

array([[39, 70],
       [46, 76]], dtype=int64)

In [15]:
# Generating classification report
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.46      0.36      0.40       109
           1       0.52      0.62      0.57       122

    accuracy                           0.50       231
   macro avg       0.49      0.49      0.48       231
weighted avg       0.49      0.50      0.49       231

