In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


# Stock Market Data

In [None]:
import yfinance as yf

def download_stock_data(ticker, start, end, interval="1d"):
  df = yf.download(ticker, start=start, end=end, interval=interval)
  return df

In [None]:
stock_ticker = "TSLA"
start_date = "2021-09-30"
end_date = "2022-09-29"

stock_market_df = download_stock_data(stock_ticker, start_date, end_date)

save_path = '/content/drive/MyDrive/SDP-FINAL/TSLA_stock_data.csv'

stock_market_df.to_csv(save_path)

print()
print(stock_market_df)


[*********************100%***********************]  1 of 1 completed



Price            Close        High         Low        Open    Volume
Ticker            TSLA        TSLA        TSLA        TSLA      TSLA
Date                                                                
2021-09-30  258.493347  263.043335  258.333344  260.333344  53868000
2021-10-01  258.406677  260.260010  254.529999  259.466675  51094200
2021-10-04  260.510010  268.989990  258.706665  265.500000  91449900
2021-10-05  260.196655  265.769989  258.066681  261.600006  55297800
2021-10-06  260.916656  262.220001  257.739990  258.733337  43898400
...                ...         ...         ...         ...       ...
2022-09-22  288.589996  301.290009  285.820007  299.859985  70545400
2022-09-23  275.329987  284.500000  272.820007  283.089996  63748400
2022-09-26  276.010010  284.089996  270.309998  271.829987  58076900
2022-09-27  282.940002  288.670013  277.510010  283.839996  61925200
2022-09-28  287.809998  289.000000  277.570007  283.079987  54664800

[251 rows x 5 columns]


# Social Media Data

In [None]:
import pandas as pd

# Load the CSV file
file_path = "/content/drive/MyDrive/SDP-FINAL/kaggle_tweets.csv"  # Update with the correct file path
df = pd.read_csv(file_path)

# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Filter out rows where the date is <= 2021-12-30
filtered_df = df[(df['Date'] > '2021-09-29') & (df['Stock Name'] == 'TSLA')]

filtered_df['Date'] = filtered_df['Date'].dt.date

filtered_file_path = "/content/drive/MyDrive/SDP-FINAL/TSLA_tweets.csv"  # Update as needed
filtered_df.to_csv(filtered_file_path, index=False)

print("Filtered data saved to:", filtered_file_path)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Date'] = filtered_df['Date'].dt.date


Filtered data saved to: /content/drive/MyDrive/SDP-FINAL/TSLA_tweets.csv


# Sentiment Analysis

In [None]:
!pip install vaderSentiment

import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()



In [None]:
# Function to get the sentiment label based on compound score
def get_sentiment_label(tweet):
    compound_score = analyzer.polarity_scores(tweet)['compound']
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [None]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return text

In [None]:
input_csv_path = '/content/drive/MyDrive/SDP-FINAL/TSLA_tweets.csv'  # Update with your CSV file path
df = pd.read_csv(input_csv_path)

# Display the first few rows to check the structure
print(df.head())

# Add the Sentiment column
df['Tweet'] = df['Tweet'].apply(lambda tweet: clean_text(tweet))
df['Sentiment'] = df['Tweet'].apply(lambda tweet: get_sentiment_label(tweet))

# Save the updated DataFrame to a new CSV file
output_csv_path = '/content/drive/MyDrive/SDP-FINAL/TSLA_sentiment_tweets.csv'
df.to_csv(output_csv_path, index=False)

print(f"Sentiment analysis complete. Data saved to {output_csv_path}")


         Date                                              Tweet Stock Name  \
0  2022-09-29  Mainstream media has done an amazing job at br...       TSLA   
1  2022-09-29  Tesla delivery estimates are at around 364k fr...       TSLA   
2  2022-09-29  3/ Even if I include 63.0M unvested RSUs as of...       TSLA   
3  2022-09-29  @RealDanODowd @WholeMarsBlog @Tesla Hahaha why...       TSLA   
4  2022-09-29  @RealDanODowd @Tesla Stop trying to kill kids,...       TSLA   

  Company Name  
0  Tesla, Inc.  
1  Tesla, Inc.  
2  Tesla, Inc.  
3  Tesla, Inc.  
4  Tesla, Inc.  
Sentiment analysis complete. Data saved to /content/drive/MyDrive/SDP-FINAL/TSLA_sentiment_tweets.csv


# Merge both dataset

In [None]:
import pandas as pd

# Load datasets
tweets_file_path = "/content/drive/MyDrive/SDP-FINAL/TSLA_sentiment_tweets.csv"
stock_file_path = "/content/drive/MyDrive/SDP-FINAL/TSLA_stock_data.csv"

tweets_df = pd.read_csv(tweets_file_path)
stock_df = pd.read_csv(stock_file_path, skiprows=1)  # Skip first row if necessary

# Rename stock columns
stock_df.columns = ["Date", "Close Price", "High Price", "Low Price", "Open Price", "Trading Volume"]

# Convert date columns to datetime format
tweets_df["Date"] = pd.to_datetime(tweets_df["Date"], errors='coerce')
stock_df["Date"] = pd.to_datetime(stock_df["Date"], errors='coerce')

# Convert stock price columns to numeric
for col in ["Close Price", "High Price", "Low Price", "Open Price", "Trading Volume"]:
    stock_df[col] = pd.to_numeric(stock_df[col], errors='coerce')

# Map sentiment to numerical values
sentiment_mapping = {"Positive": 1, "Neutral": 0, "Negative": -1}
tweets_df["Sentiment Score"] = tweets_df["Sentiment"].map(sentiment_mapping)

# Aggregate tweets per date
tweets_agg = tweets_df.groupby("Date").agg(
    Tweet_Count=("Sentiment Score", "count"),
    Avg_Tweet_Sentiment=("Sentiment Score", "mean")
).reset_index()

# Merge tweets and stock data
merged_df = pd.merge(stock_df, tweets_agg, on="Date", how="left")

# Fill missing Tweet data by propagating previous day's values
merged_df["Tweet_Count"].fillna(method="ffill", inplace=True)
merged_df["Avg_Tweet_Sentiment"].fillna(method="ffill", inplace=True)

# Compute target label (1 = Up, 0 = Down)
merged_df["Next Day Close Price"] = merged_df["Close Price"].shift(-1)
merged_df["Stock Movement"] = (merged_df["Next Day Close Price"] > merged_df["Close Price"]).astype(int)

# Drop extra column
merged_df.drop(columns=["Next Day Close Price"], inplace=True)

# Save the final dataset
merged_df.to_csv("/content/drive/MyDrive/SDP-FINAL/TSLA_dataset.csv", index=False)

print("Merged dataset saved as merged_tesla_data.csv")


Merged dataset saved as merged_tesla_data.csv


  stock_df["Date"] = pd.to_datetime(stock_df["Date"], errors='coerce')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["Tweet_Count"].fillna(method="ffill", inplace=True)
  merged_df["Tweet_Count"].fillna(method="ffill", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["Avg_Tweet_Sentiment"].fi

# Data Preprocessing

In [None]:
import pandas as pd

# Load the CSV file
file_path = "/content/drive/MyDrive/SDP-ML/stock_tweets.csv"  # Update with the correct file path
df = pd.read_csv(file_path)

# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Filter out rows where the date is <= 2021-12-30
filtered_df = df[(df['Date'] > '2021-09-29') & (df['Stock Name'] == 'TSLA')]

filtered_df['Date'] = filtered_df['Date'].dt.date

filtered_file_path = "/content/drive/MyDrive/SDP-ML/temp.csv"  # Update as needed
filtered_df.to_csv(filtered_file_path, index=False)

print("Filtered data saved to:", filtered_file_path)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Date'] = filtered_df['Date'].dt.date


Filtered data saved to: /content/drive/MyDrive/SDP-ML/temp.csv


In [None]:
# add class label

import pandas as pd

# Load the CSV file
file_path = "/content/drive/MyDrive/SDP-ML/merged_tesla_data.csv"
df = pd.read_csv(file_path)

# Ensure the data is sorted by date
df = df.sort_values(by="Date").reset_index(drop=True)

# Create the 'Class' column based on the given condition
df['Class'] = (df['Stock Close Price'].shift(-1) > df['Stock Close Price']).astype(int)

# Save the modified DataFrame
output_path = "/content/drive/MyDrive/SDP-ML/final_telsa_data.csv"
df.to_csv(output_path, index=False)

print(f"Modified file saved to: {output_path}")


Modified file saved to: /content/drive/MyDrive/SDP-ML/final_telsa_data.csv


In [None]:
# merge dataset

import pandas as pd

# Load datasets
tweets_file_path = "/content/drive/MyDrive/SDP-ML/TSLA_sentiment_tweets.csv"
stock_file_path = "/content/drive/MyDrive/SDP-ML/TSLA_data.csv"

tweets_df = pd.read_csv(tweets_file_path)
stock_df = pd.read_csv(stock_file_path, skiprows=1)  # Skip first row if necessary

# Rename stock columns
stock_df.columns = ["Date", "Stock Close Price", "Stock High Price", "Stock Low Price", "Stock Open Price", "Stock Trading Volume"]

# Convert date columns to datetime format
tweets_df["Date"] = pd.to_datetime(tweets_df["Date"], errors='coerce')
stock_df["Date"] = pd.to_datetime(stock_df["Date"], errors='coerce')

# Convert stock price columns to numeric
for col in ["Stock Close Price", "Stock High Price", "Stock Low Price", "Stock Open Price", "Stock Trading Volume"]:
    stock_df[col] = pd.to_numeric(stock_df[col], errors='coerce')

# Map sentiment to numerical values
sentiment_mapping = {"Positive": 1, "Neutral": 0, "Negative": -1}
tweets_df["Sentiment Score"] = tweets_df["Sentiment"].map(sentiment_mapping)

# Aggregate tweets per date
tweets_agg = tweets_df.groupby("Date").agg(
    Tweet_Count=("Sentiment Score", "count"),
    Avg_Tweet_Sentiment=("Sentiment Score", "mean")
).reset_index()

# Merge tweets and stock data
merged_df = pd.merge(stock_df, tweets_agg, on="Date", how="left")

# Fill missing Tweet data by propagating previous day's values
merged_df["Tweet_Count"].fillna(method="ffill", inplace=True)
merged_df["Avg_Tweet_Sentiment"].fillna(method="ffill", inplace=True)

# Compute target label (1 = Up, 0 = Down)
merged_df["Next Day Close Price"] = merged_df["Stock Close Price"].shift(-1)
merged_df["Stock Movement"] = (merged_df["Next Day Close Price"] > merged_df["Stock Close Price"]).astype(int)

# Drop extra column
merged_df.drop(columns=["Next Day Close Price"], inplace=True)

# Save the final dataset
merged_df.to_csv("/content/drive/MyDrive/SDP-ML/final.csv", index=False)

print("Merged dataset saved as merged_tesla_data.csv")


Merged dataset saved as merged_tesla_data.csv


  stock_df["Date"] = pd.to_datetime(stock_df["Date"], errors='coerce')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["Tweet_Count"].fillna(method="ffill", inplace=True)
  merged_df["Tweet_Count"].fillna(method="ffill", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["Avg_Tweet_Sentiment"].fi

# Train Model Using LogisticRegression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = "/content/drive/MyDrive/SDP-FINAL/TSLA_dataset.csv"  # Update this path if needed
df = pd.read_csv(file_path)

# Handle missing values (if any)
df.dropna(inplace=True)

# Define features (X) and target (y)
X = df.drop(columns=["Stock Movement", "Date"])  # Exclude target & non-numeric columns
y = df["Stock Movement"]  # Target column (0 = Down, 1 = Up)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print Results
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)


Model Accuracy: 0.4706
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.29      0.34        24
           1       0.50      0.63      0.56        27

    accuracy                           0.47        51
   macro avg       0.46      0.46      0.45        51
weighted avg       0.46      0.47      0.46        51



# Train Model using RandomForestClassifier

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/SDP-FINAL/TSLA_dataset.csv')

# Drop the first row if it contains NaN values
df = df.dropna().reset_index(drop=True)

# Rename columns for consistency
df.rename(columns={
    'Close Price': 'Close',
    'High Price': 'High',
    'Low Price': 'Low',
    'Open Price': 'Open',
    'Trading Volume': 'Volume',
    'Avg_Tweet_Sentiment': 'Sentiment_Score',
    'Stock Movement': 'Target'
}, inplace=True)

# Calculate the percentage change in the closing price
df['Close_pct_change'] = df['Close'].pct_change()

# Shift the 'Close_pct_change' to create the target variable
df['Target'] = df['Close_pct_change'].shift(-1)
df['Target'] = np.where(df['Target'] > 0, 1, 0)  # 1 if price goes up, 0 otherwise

# Drop the last row (because the target is NaN)
df = df.iloc[:-1]

# Feature Selection
features = ['Sentiment_Score', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close_pct_change']
target = 'Target'
X = df[features]
y = df[target]

# Handle NaN values (optional - depending on your data)
X = X.fillna(method='ffill')  # Forward fill for simplicity

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Example of how to predict using new data
new_data = pd.DataFrame({
    'Sentiment_Score': [0.2],
    'Open': [100],
    'High': [102],
    'Low': [98],
    'Close': [101],
    'Volume': [100000],
    'Close_pct_change': [0.01]  # Replace with an actual value
})

prediction = model.predict(new_data)[0]
print(f"Prediction: {prediction} (1: Up, 0: Down)")


  X = X.fillna(method='ffill')  # Forward fill for simplicity


Accuracy: 0.5600
Prediction: 1 (1: Up, 0: Down)


# Train Model using RandomForestRegressor

In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
import xgboost as xgb

# Load the dataset
# df = pd.read_csv('merged_tesla_data.csv')
df = pd.read_csv('/content/drive/MyDrive/SDP-FINAL/TSLA_dataset.csv')

# Drop the first row if it contains NaN values
df = df.dropna().reset_index(drop=True)

# Rename columns for consistency
df.rename(columns={
    'Close Price': 'Close',
    'High Price': 'High',
    'Low Price': 'Low',
    'Open Price': 'Open',
    'Trading Volume': 'Volume',
    'Avg_Tweet_Sentiment': 'Sentiment_Score',
    'Stock Movement': 'Target'
}, inplace=True)

# Feature Engineering
df['MA5'] = df['Close'].rolling(window=5).mean()
df['MA20'] = df['Close'].rolling(window=20).mean()
df.dropna(inplace=True)

# Define target variable based on close price change
df['Target'] = np.where(df['Close'].pct_change() > 0.005, 1, 0)

# Define features and target variables
features = ['Sentiment_Score', 'Open', 'High', 'Low', 'Volume', 'MA5', 'MA20']
X = df[features]
y_direction = df['Target']  # Target for direction prediction
y_price = df['Close']  # Target for price prediction

# Split data into training and testing sets
X_train_dir, X_test_dir, y_train_dir, y_test_dir = train_test_split(X, y_direction, test_size=0.1, random_state=42)
X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(X, y_price, test_size=0.1, random_state=42)

# Train the direction prediction model (XGBoost Classifier)
model_direction = xgb.XGBClassifier()
model_direction.fit(X_train_dir, y_train_dir)

# Train the price prediction model (RandomForestRegressor)
model_price = RandomForestRegressor(random_state=42)
model_price.fit(X_train_price, y_train_price)

# Evaluate the models
y_pred_direction = model_direction.predict(X_test_dir)
direction_accuracy = accuracy_score(y_test_dir, y_pred_direction)
print(f"Direction prediction accuracy: {direction_accuracy:.2f}")

y_pred_price = model_price.predict(X_test_price)
price_rmse = np.sqrt(mean_squared_error(y_test_price, y_pred_price))
print(f"Price prediction RMSE: {price_rmse:.2f}")

# # Example Prediction (replace with actual new data)
new_data = X.iloc[-1].copy()
new_data['Sentiment_Score'] = 0.2
new_data = pd.DataFrame([new_data])

# Predict the stock movement for next day
price_prediction = model_price.predict(new_data)
direction_prediction = model_direction.predict(new_data)[0]

print(f"Predicted Price: {price_prediction[0]:.2f}")
print(f"Predicted Direction: {'Up' if direction_prediction == 1 else 'Down'}")

# Test on one data point (from test set)
test_index = 20  # Select first test sample
test_data = X_test_price.iloc[[test_index]]
test_actual_price = y_test_price.iloc[test_index]
test_actual_direction = y_test_dir.iloc[test_index]

test_predicted_price = model_price.predict(test_data)[0]
test_predicted_direction = model_direction.predict(test_data)[0]

print("\nTest Data Input:")
print(test_data)
print("\nActual Output:")
print(f"Actual Price: {test_actual_price:.2f}")
print(f"Actual Direction: {'Up' if test_actual_direction == 1 else 'Down'}")
print("\nPredicted Output:")
print(f"Predicted Price: {test_predicted_price:.2f}")
print(f"Predicted Direction: {'Up' if test_predicted_direction == 1 else 'Down'}")


Direction prediction accuracy: 0.54
Price prediction RMSE: 4.53
Predicted Price: 285.94
Predicted Direction: Up

Test Data Input:
     Sentiment_Score        Open        High         Low      Volume  \
127         0.097345  360.383331  364.916656  355.546661  54263100.0   

            MA5        MA20  
127  363.172668  311.557999  

Actual Output:
Actual Price: 361.53
Actual Direction: Up

Predicted Output:
Predicted Price: 361.61
Predicted Direction: Down
