<a href="https://colab.research.google.com/github/ReAlex1902/stock-trend-pred/blob/main/modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load data

In [None]:
# Mount Google Drive inside the Colab VM so the dataset files stored on the
# shared drive can be copied to local disk by the next cell.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
# Copy the inputs from the mounted shared drive to the VM's local disk:
#   - msft_df.parquet   : the table read into `msft_df` below
#   - msft-texts-daily/ : directory of text files (copied recursively)
!cp /content/gdrive/Shareddrives/gdrive/datasets/thesis/msft_df.parquet /content
!cp -r /content/gdrive/Shareddrives/gdrive/datasets/thesis/msft-texts-daily /content

# Data preprocessing

In [None]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.10.1


In [None]:
import pandas as pd
import numpy as np

from datetime import timedelta

import os
import spacy
import re

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the MSFT table that was copied from Drive to local disk.
msft_df = pd.read_parquet(
    '/content/msft_df.parquet',
    engine='fastparquet',
)

In [None]:
# Turn the date index into a regular 'Date' column.
# Guarded so the cell is idempotent: running reset_index() a second time would
# prepend an extra 'index' column and shift the positional column slice that
# the scaling cell relies on.
if 'Date' not in msft_df.columns:
    msft_df = msft_df.reset_index()

In [None]:
# Add the "2-week-change" column.
# Despite its name this is a price level: for every row it holds the mean
# 'Close' of the rows dated within the following 14 calendar days (the row's
# own date excluded). Rows with no later data in that window get NaN.
def _mean_close_after(day):
    window_end = pd.to_datetime(day) + timedelta(days=14)
    in_window = (msft_df['Date'] > day) & (msft_df['Date'] <= window_end)
    return msft_df.loc[in_window, 'Close'].mean()


msft_df['2-week-change'] = msft_df['Date'].apply(_mean_close_after)

# Store the date as a 'YYYY-MM-DD' string from here on.
msft_df['Date'] = msft_df['Date'].dt.strftime('%Y-%m-%d')

In [None]:
# "percent-change": relative difference, in percent, between the forward-window
# mean stored in '2-week-change' and the current close.
msft_df['percent-change'] = (msft_df['2-week-change'] / msft_df['Close'] - 1) * 100

# "change-category": bucket the percentage into five classes
# (pd.cut intervals are right-closed by default).
change_bins = [-float('inf'), -4, -1, 1, 4, float('inf')]
change_labels = ['<-4%', '-4% to -1%', '-1% to +1%', '+1% to +4%', '4%+']
msft_df['change-category'] = pd.cut(
    msft_df['percent-change'],
    bins=change_bins,
    labels=change_labels,
)

In [None]:
# Inspect the frame after the target columns have been added.
msft_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,2-week-change,percent-change,change-category
0,2022-01-03,329.316873,331.919192,323.847073,328.727661,28865100,0.0,0.0,309.423543,-5.872374,<-4%
1,2022-01-04,328.806246,329.169614,320.252951,323.090973,32674300,0.0,0.0,306.547340,-5.120426,<-4%
2,2022-01-05,319.997571,320.203814,310.295343,310.688141,40054300,0.0,0.0,305.123427,-1.791093,-4% to -1%
3,2022-01-06,307.516277,312.966447,305.886137,308.233154,39646100,0.0,0.0,303.783525,-1.443592,-4% to -1%
4,2022-01-07,308.498296,310.806024,304.511340,308.390289,32720000,0.0,0.0,301.818407,-2.131028,-4% to -1%
...,...,...,...,...,...,...,...,...,...,...,...
496,2023-12-22,373.679993,375.179993,372.709991,374.579987,17091100,0.0,0.0,375.012505,0.115467,-1% to +1%
497,2023-12-26,375.000000,376.940002,373.500000,374.660004,12673100,0.0,0.0,375.130005,0.125447,-1% to +1%
498,2023-12-27,373.690002,375.059998,372.809998,374.070007,14905400,0.0,0.0,375.660004,0.425053,-1% to +1%
499,2023-12-28,375.369995,376.459991,374.160004,375.279999,14327000,0.0,0.0,376.040009,0.202518,-1% to +1%


In [None]:
# Min-max scale the seven columns at positions 1..7 (everything between 'Date'
# and the derived target columns).
scaler = MinMaxScaler()
raw_columns = msft_df.columns[1:8]
msft_df_scaled = pd.DataFrame(
    scaler.fit_transform(msft_df[raw_columns]),
    columns=raw_columns,
)

# Carry the date and the target-related columns over unscaled.
for passthrough in ('Date', '2-week-change', 'percent-change', 'change-category'):
    msft_df_scaled[passthrough] = msft_df[passthrough]

In [None]:
# Lag features: for each listed column add its value from each of the 14
# preceding rows. The shift is by table rows, not calendar days (the table
# has no rows for some dates), so the "_prev_{i}d" suffix counts rows.
lagged_columns = {
    f'{col}_prev_{i}d': msft_df_scaled[col].shift(i)
    for col in ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends']
    for i in range(1, 15)
}

# Drop every row with a missing value: the first 14 rows (incomplete history)
# and any row whose target could not be computed.
msft_df_scaled = msft_df_scaled.assign(**lagged_columns).dropna()

In [None]:
# Inspect the scaled frame with the lag columns added.
msft_df_scaled

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Date,2-week-change,percent-change,...,Dividends_prev_5d,Dividends_prev_6d,Dividends_prev_7d,Dividends_prev_8d,Dividends_prev_9d,Dividends_prev_10d,Dividends_prev_11d,Dividends_prev_12d,Dividends_prev_13d,Dividends_prev_14d
14,0.426196,0.444036,0.359593,0.463746,0.942170,0.0,0.0,2022-01-24,298.004520,2.393639,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.422239,0.431529,0.413157,0.418475,0.783569,0.0,0.0,2022-01-25,299.582605,5.747494,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.518095,0.511238,0.459321,0.465699,1.000000,0.0,0.0,2022-01-26,301.006516,3.306600,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.487074,0.504158,0.488100,0.483681,0.545138,0.0,0.0,2022-01-27,301.255945,2.312900,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.472931,0.511120,0.467661,0.532055,0.499124,0.0,0.0,2022-01-28,299.957730,-0.910606,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.933621,0.940580,0.951436,0.946411,0.104732,0.0,0.0,2023-12-21,374.926001,0.371043,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.940259,0.945206,0.967404,0.952495,0.097138,0.0,0.0,2023-12-22,375.012505,0.115467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.948082,0.955780,0.972129,0.952963,0.042748,0.0,0.0,2023-12-26,375.130005,0.125447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,0.940318,0.944485,0.968002,0.949511,0.070229,0.0,0.0,2023-12-27,375.660004,0.425053,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Persist the scaled frame with lag features. The path is absolute so the
# file always lands where the following copy-to-Drive cell reads it from,
# whatever the kernel's current working directory is.
msft_df_scaled.to_parquet('/content/msft_df_scaled.parquet', engine='fastparquet')

In [None]:
# Copy the saved parquet file to the thesis dataset folder on the shared drive.
!cp /content/msft_df_scaled.parquet /content/gdrive/Shareddrives/gdrive/datasets/thesis

In [None]:
# NOTE: this whole cell is disabled, yet `result_df` (built on its last lines)
# is required by the merge cell in the "With news" section. On a fresh kernel
# either uncomment and run this cell or load a previously saved copy of its
# output before running that section.

# # Load spaCy English model
# nlp = spacy.load("en_core_web_sm")

# # Function to clean and preprocess the text using spaCy
# def preprocess_text_spacy(text):
#     # Remove special characters, numbers, and extra whitespaces
#     text = re.sub(r'[^a-zA-Z\s]', '', text)
#     text = re.sub(r'\s+', ' ', text).strip()

#     # Process the text using spaCy
#     doc = nlp(text)

#     # Lemmatize and remove stopwords
#     tokens = [token.lemma_ for token in doc if not token.is_stop]

#     # Join the processed tokens back into a single string
#     clean_text = ' '.join(tokens)

#     return clean_text

# # Folder with text files
# folder_path = "msft-texts-daily"

# # List to store data before creating the dataframe
# data_list = []

# # Iterate over files in the directory
# for filename in os.listdir(folder_path):
#     if filename.endswith(".txt"):
#         file_path = os.path.join(folder_path, filename)

#         # Extract date from the filename
#         date_str = filename[:-4]  # Remove ".txt"

#         # Open the file and read the text
#         with open(file_path, 'r', encoding='utf-8') as file:
#             text = file.read()

#         # Clean and preprocess the text using spaCy
#         clean_text = preprocess_text_spacy(text)

#         # Add data to the list
#         data_list.append({'Date': date_str, 'Text': clean_text})

# # Create a dataframe
# df = pd.DataFrame(data_list)

# # Apply TF-IDF vectorization
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(df['Text'])

# # Create a dataframe with TF-IDF values
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# result_df = pd.concat([df[['Date']], tfidf_df], axis=1)

# result_df

Unnamed: 0,Date,aa,aaa,aaaa,aaaaaand,aaai,aaaiiiee,aaaquality,aaarate,aaasize,...,zwiezen,zwischenschritt,zword,zygna,zynamics,zynex,zynga,zyxel,zyxi,zz
0,2023-06-15,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005086,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0
1,2023-05-14,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0
2,2023-09-30,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0
3,2023-04-08,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0
4,2023-06-18,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,2023-08-18,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0
391,2023-06-14,0.003222,0.006668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0196,0.00386,0.0,0.0
392,2022-10-18,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0
393,2023-04-19,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0,0.0


In [None]:
# Disabled: would save the TF-IDF frame `result_df` to a local parquet file.
# result_df.to_parquet('msft_df_scaled_tfidf.parquet', engine='fastparquet')

In [None]:
# Disabled: would copy the saved TF-IDF parquet file to the shared drive.
# !cp /content/msft_df_scaled_tfidf.parquet /content/gdrive/Shareddrives/gdrive/datasets/thesis

# Random Forest

## No news

In [None]:
# Baseline: random forest on the unscaled columns of `msft_df` (no lag
# features, no text features), predicting 'change-category'.

# Columns that must not be used as features: the date and the target together
# with the values it is derived from.
features_to_exclude = ['Date', '2-week-change', 'percent-change', 'change-category']
features = [col for col in msft_df.columns if col not in features_to_exclude]

# Rows with an empty forward window have a NaN target. Drop them before
# encoding: otherwise LabelEncoder turns NaN into an extra class, and the
# classification report fails as soon as such a row lands in the test split.
labelled_df = msft_df.dropna(subset=['change-category'])

# Feature matrix and target
X = labelled_df[features]
y = labelled_df['change-category']

# Encode the category labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train/test split
# NOTE(review): this is a shuffled split of time-ordered rows whose targets are
# built from overlapping forward windows, so the score may be optimistic --
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Model definition
rf_classifier = RandomForestClassifier(
                    n_estimators = 10000,
                    criterion = 'entropy',
                    max_depth = 12,
                    random_state=11)

# Fit on the training part
rf_classifier.fit(X_train, y_train)

# Predict on the held-out part
y_pred = rf_classifier.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class report. `labels` is passed explicitly so that the names always line
# up with the encoded classes, even if a class is absent from the test split.
class_report = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.37
Classification Report:
              precision    recall  f1-score   support

  +1% to +4%       0.33      0.44      0.37        32
  -1% to +1%       0.46      0.36      0.41        33
  -4% to -1%       0.35      0.38      0.36        16
         4%+       0.25      0.29      0.27         7
        <-4%       0.43      0.23      0.30        13

    accuracy                           0.37       101
   macro avg       0.36      0.34      0.34       101
weighted avg       0.38      0.37      0.37       101



## With news

In [None]:
# `result_df` (one row of TF-IDF features per date) is only defined by the
# text-preprocessing cell, which is currently commented out, so on a fresh
# kernel the name does not exist. Fall back to the cached copy in that case.
# NOTE(review): the path is taken from the disabled save/copy cells -- confirm
# that the file is actually present on the shared drive.
if 'result_df' not in globals():
    result_df = pd.read_parquet(
        '/content/gdrive/Shareddrives/gdrive/datasets/thesis/msft_df_scaled_tfidf.parquet',
        engine='fastparquet',
    )

# Keep only the dates present in both the price table and the text table.
full_msft_df = msft_df_scaled.merge(result_df, how='inner', on='Date')

In [None]:
# Random forest on the merged frame: scaled price columns, lag features and
# TF-IDF text features, predicting 'change-category'.

# Columns that must not be used as features: the date and the target together
# with the values it is derived from.
features_to_exclude = ['Date', '2-week-change', 'percent-change', 'change-category']
features = [col for col in full_msft_df.columns if col not in features_to_exclude]

# Feature matrix and target
X = full_msft_df[features]
y = full_msft_df['change-category']

# Encode the category labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train/test split
# NOTE(review): this is a shuffled split of time-ordered rows with lag features
# and overlapping target windows, so the score may be optimistic -- consider a
# chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=11)

# Model definition
rf_classifier = RandomForestClassifier(
                    n_estimators = 10000,
                    criterion = 'entropy',
                    max_depth = 12,
                    random_state=11)

# Fit on the training part
rf_classifier.fit(X_train, y_train)

# Predict on the held-out part
y_pred = rf_classifier.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class report.
# `labels` keeps the names aligned with the encoded classes even if a class is
# missing from the test split. `zero_division=0` reports 0.00 for classes that
# are never predicted without emitting a warning for each of them.
class_report = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.44
Classification Report:
              precision    recall  f1-score   support

  +1% to +4%       0.52      0.68      0.59        19
  -1% to +1%       0.38      0.71      0.50        14
  -4% to -1%       0.00      0.00      0.00         9
         4%+       0.25      0.10      0.14        10
        <-4%       0.00      0.00      0.00         3

    accuracy                           0.44        55
   macro avg       0.23      0.30      0.25        55
weighted avg       0.32      0.44      0.36        55



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
