<a href="https://colab.research.google.com/github/Rinniedh/Diane-Hoang-Sentiment-Analysis-using-Linear-Regression/blob/main/Diane_Hoang_Sentiment_Analysis_using_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load both worksheets from the project workbook.
file_path = '/Project 02 - Data.xlsx'

# Opening the workbook as a context manager closes the underlying file handle
# as soon as both sheets are read, instead of leaving it open for the rest of
# the kernel session.
with pd.ExcelFile(file_path) as xls:
    # Numeric sheet: one row per day (prices, moving averages, volume, target)
    df_numeric = pd.read_excel(xls, sheet_name='Numeric Data')

    # Tweet sheet: the raw tweets that are scored for sentiment later on
    df_tweets = pd.read_excel(xls, sheet_name='Tweets')

# Display the first few rows of each dataframe to understand their structure
print("Numeric Data:")
print(df_numeric.head())
print("\nTweets Data:")
print(df_tweets.head())


Numeric Data:
   day  open_price  high_price  low_price  moving_average_5_day  \
0    1       44.55       44.83      44.32                 44.36   
1    2       44.89       45.12      44.17                 44.37   
2    3       45.04       45.35      44.84                 44.57   
3    4       45.02       45.15      44.70                 44.65   
4    5       44.75       45.04      44.65                 44.76   

   moving_average_10_day  moving_average_50_day  moving_average_200_day  \
0                  43.82                  40.23                   42.25   
1                  44.06                  40.32                   42.27   
2                  44.21                  40.42                   42.27   
3                  44.40                  40.50                   42.28   
4                  44.58                  40.59                   42.28   

    volume  next_day_close_price  
0  1500700                 44.83  
1  2391800                 45.03  
2  1723400                 

In [2]:
##Step 3: Sentiment Analysis

from textblob import TextBlob

def get_sentiment(tweet):
    """Return the TextBlob ``(polarity, subjectivity)`` pair for one tweet.

    Any value that is not a ``str`` is scored as ``(0.0, 0.0)`` without
    being passed to TextBlob.
    """
    if isinstance(tweet, str):
        sentiment = TextBlob(tweet).sentiment
        return sentiment.polarity, sentiment.subjectivity
    # Non-string input: fall back to a zero score for both measures
    return 0.0, 0.0

# Score every tweet, then split the (polarity, subjectivity) pairs into two columns
sentiment_pairs = df_tweets['tweet'].apply(get_sentiment)
polarity_values, subjectivity_values = zip(*sentiment_pairs)
df_tweets['polarity'] = polarity_values
df_tweets['subjectivity'] = subjectivity_values

# Average both sentiment scores over all tweets that share the same day
daily_sentiment = (
    df_tweets
    .groupby('day')[['polarity', 'subjectivity']]
    .mean()
    .reset_index()
)
print(daily_sentiment.head())



   day  polarity  subjectivity
0    1  0.069925      0.180038
1    2  0.036106      0.244376
2    3  0.000953      0.244024
3    4 -0.013889      0.188889
4    5  0.064073      0.225643


In [3]:
## Step 4: Combine Datasets

# Left-join the per-day sentiment averages onto the numeric rows, so every
# numeric row is kept even when no sentiment exists for its day.
df_combined = df_numeric.merge(daily_sentiment, how='left', on='day')

# Preview the merged frame
print(df_combined.head())


   day  open_price  high_price  low_price  moving_average_5_day  \
0    1       44.55       44.83      44.32                 44.36   
1    2       44.89       45.12      44.17                 44.37   
2    3       45.04       45.35      44.84                 44.57   
3    4       45.02       45.15      44.70                 44.65   
4    5       44.75       45.04      44.65                 44.76   

   moving_average_10_day  moving_average_50_day  moving_average_200_day  \
0                  43.82                  40.23                   42.25   
1                  44.06                  40.32                   42.27   
2                  44.21                  40.42                   42.27   
3                  44.40                  40.50                   42.28   
4                  44.58                  40.59                   42.28   

    volume  next_day_close_price  polarity  subjectivity  
0  1500700                 44.83  0.069925      0.180038  
1  2391800                 4

In [4]:
## Step 5: Feature Engineering and Data Cleaning

# Fill missing sentiment values with zero.
# A single DataFrame-level fillna with a per-column mapping replaces the
# chained `df[col].fillna(..., inplace=True)` calls: those operate on a
# temporary Series, emit a FutureWarning on pandas 2.x and do not update the
# frame at all once Copy-on-Write is enabled.
df_combined = df_combined.fillna({'polarity': 0, 'subjectivity': 0})

# Additional feature engineering or data cleaning tasks can be added here

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df_combined.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   day                     1300 non-null   int64  
 1   open_price              1300 non-null   float64
 2   high_price              1300 non-null   float64
 3   low_price               1300 non-null   float64
 4   moving_average_5_day    1300 non-null   float64
 5   moving_average_10_day   1300 non-null   float64
 6   moving_average_50_day   1300 non-null   float64
 7   moving_average_200_day  1300 non-null   float64
 8   volume                  1300 non-null   int64  
 9   next_day_close_price    1000 non-null   float64
 10  polarity                1300 non-null   float64
 11  subjectivity            1300 non-null   float64
dtypes: float64(10), int64(2)
memory usage: 122.0 KB
None


In [5]:
## Step 6: Split Data
# Boolean mask: True where the target price is missing
target_missing = df_combined['next_day_close_price'].isna()

# Rows without a target are the ones predictions are needed for
df_predict = df_combined[target_missing].copy()

# Rows with a known target form the modelling set
df_train = df_combined[~target_missing].copy()

# Report the size of each partition
print("Training Data Shape:", df_train.shape)
print("Prediction Data Shape:", df_predict.shape)


Training Data Shape: (1000, 12)
Prediction Data Shape: (300, 12)


In [6]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the target variable
X = df_train.drop(columns=['next_day_close_price'])
y = df_train['next_day_close_price']

# Hold out 25% of the labelled rows for testing (75% train / 25% test).
# The previous comment claimed a 70/30 split, which did not match the
# test_size actually passed below.
test_fraction = 0.25
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=42
)

# Displaying the shapes of the resulting datasets
print("Training Data Shape:", X_train.shape, y_train.shape)
print("Test Data Shape:", X_test.shape, y_test.shape)


Training Data Shape: (750, 11) (750,)
Test Data Shape: (250, 11) (250,)


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Step 7: Model Training

# Create the Linear Regression model and fit it on the training split
# (fit() returns the estimator itself, so both steps chain into one line)
linear_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split
y_pred_linear = linear_model.predict(X_test)

# Compute all three evaluation metrics on the test split
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)

# Report them: MSE, then R-squared, then MAE
print(f'Linear Regression Mean Squared Error: {mse_linear}')
print(f'Linear Regression R-squared: {r2_linear}')
print(f'Linear Regression Mean Absolute Error: {mae_linear}')


Linear Regression Mean Squared Error: 0.3202138182230296
Linear Regression R-squared: 0.9968546368338088
Linear Regression Mean Absolute Error: 0.3761173624711928


In [8]:
# Step 8: Making Predictions

# df_predict holds the rows whose next_day_close_price is unknown; drop that
# empty target column so the features match what the model was fitted on.
X_predict = df_predict.drop(columns=['next_day_close_price'])

# Run the trained Linear Regression model on those rows
y_pred_linear_predict = linear_model.predict(X_predict)

# Pair each day with its predicted price (the original row index is kept)
predictions_df_predict = df_predict[['day']].assign(
    next_day_close_price_predicted=y_pred_linear_predict
)
print(predictions_df_predict)


       day  next_day_close_price_predicted
1000  1001                       56.441725
1001  1002                       58.836219
1002  1003                       58.731218
1003  1004                       58.054063
1004  1005                       58.682448
...    ...                             ...
1295  1296                      114.269896
1296  1297                      116.624407
1297  1298                      116.519459
1298  1299                      117.556166
1299  1300                      117.368751

[300 rows x 2 columns]


In [9]:
## Display the results
# A bare expression as the last line of a cell is rendered by the notebook
# as a rich table: the per-day predicted next-day close prices.
predictions_df_predict

Unnamed: 0,day,next_day_close_price_predicted
1000,1001,56.441725
1001,1002,58.836219
1002,1003,58.731218
1003,1004,58.054063
1004,1005,58.682448
...,...,...
1295,1296,114.269896
1296,1297,116.624407
1297,1298,116.519459
1298,1299,117.556166


In [10]:
## Step 9 Cross Validation

from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validation on the training split; scikit-learn reports MSE as a
# negative number so that "higher is better" holds for every scorer.
cv_scores = cross_val_score(
    linear_model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Flip the sign back to a positive MSE and take the square root to get RMSE
rmse_scores = np.sqrt(np.negative(cv_scores))

# Average RMSE over the folds
mean_rmse = rmse_scores.mean()

print("Cross-Validation RMSE Scores:", rmse_scores)
print("Mean RMSE:", mean_rmse)


Cross-Validation RMSE Scores: [0.86359143 0.53147794 0.76808    0.89866726 0.76497371]
Mean RMSE: 0.7653580677344519


In [18]:
# Save predictions to a CSV file in the current working directory
output_csv = 'Hoang_Diane.csv'
predictions_df_predict[['day', 'next_day_close_price_predicted']].to_csv(output_csv, index=False)

# google.colab is only importable inside a Colab runtime. Guarding the import
# keeps this cell from failing with ImportError when the notebook is run
# elsewhere; the CSV has already been written to disk at this point either way.
try:
    from google.colab import files
except ImportError:
    print(f'Saved {output_csv}; skipping browser download (not running in Colab).')
else:
    # Download the CSV file from Colab
    files.download(output_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>