In [14]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from sklearn.utils import shuffle
import string
import re
import os
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [13]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.9.11-cp39-cp39-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 41.5/41.5 kB 2.0 MB/s eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --- ------------------------------------ 0.1/1.5 MB 4.2 MB/s eta 0:00:01
   ------- -------------------------------- 0.3/1.5 MB 3.5 MB/s eta 0:00:01
   ------------- -------------------------- 0.5/1.5 MB 3.9 MB/s eta 0:00:01
   ------------------ --------------------- 0.7/1.5 MB 4.0 MB/s eta 0:00:01
   ------------------------ --------------- 0.9/1.5 MB 4.1 MB/s eta 0:00:01
   ----------------------------- ---------- 1.1/1.5 MB 4.2 MB/s eta 0:00:01
   ------------------------------------- -- 1.4/1.5 MB 4.2 MB/s eta 0:00:01
   ---------

In [300]:
!pip install dask[dataframe]



In [301]:
!pip install --upgrade pandas



### Fetchine Data

In [15]:
data_dir = os.path.join("..", "Data", "sbi_csv")

list_files = os.listdir(data_dir)

csv_files = [file for file in list_files if file.endswith('.csv')]

data_frames = {}
for file in csv_files:
    data_frames[file[:-4]] = pd.read_csv(os.path.join(data_dir, file))

for key in data_frames:
    print(key)

entities
entities_similar
highlights
highlights_similar
metadata
metadata_similar


In [16]:
entities = data_frames['entities']
entities_similar = data_frames['entities_similar']
highlights = data_frames['highlights']
highlights_similar = data_frames['highlights_similar']
metadata = data_frames['metadata']
metadata_similar = data_frames['metadata_similar']

In [17]:
metadata.drop(columns = ['Unnamed: 0'], inplace = True)

In [18]:
metadata.head()

Unnamed: 0,published_at,uuid,title,description,snippet,url,image_url,language,source,relevance_score
0,2024-09-18 12:45:04+00:00,bb5429b2-be36-4088-b04c-9f17e9003d2d,Banking system low-cost deposits could decline...,"SBI’s Casa ratio stood at 40.7% as on 30 June,...",Mumbai: The share of low-cost deposits in the ...,https://www.livemint.com/industry/banking/bank...,https://www.livemint.com/lm-img/img/2024/09/18...,en,livemint.com,
1,2024-09-18 12:37:42+00:00,0b24d613-2705-4283-b45e-3b8942fc5762,"State Bank of India raises Rs 7,500 cr at 7.33...",Tier-2 bond issuance: This issuance follows a ...,"The State Bank of India (SBI) raised Rs 7,500 ...",https://www.business-standard.com/companies/ne...,https://bsmedia.business-standard.com/_media/b...,en,business-standard.com,
2,2024-09-18 11:56:10+00:00,9bcbd8e3-7f97-4d00-aae8-1f08f1a674fd,"SBI raises ₹15,000 cr via Basel III bonds in a...","In less than a month, State Bank of India rais...",Shares of State Bank of India (SBI) ended 1% h...,https://www.fortuneindia.com/investing/sbi-rai...,https://media.assettype.com/fortuneindia/2022-...,en,fortuneindia.com,
3,2024-09-18 11:36:14+00:00,0d05c075-706d-4a5e-920c-ae39a41b420b,Stock Market Sectors: Stock market update: Min...,The 30-share BSE Sensex closed down 131.43 p...,\n\n\n\n(You can now subscribe to our\n\n(You ...,https://economictimes.indiatimes.com/markets/s...,"https://img.etimg.com/thumb/msid-82697732,widt...",en,economictimes.indiatimes.com,
4,2024-09-18 11:27:31+00:00,6bf48528-aaf6-4c9d-93b8-303c23033b91,"Banks’ CASA ratio may fall further as cos, gov...",SBI Chairman discusses impact of enhanced cash...,The enhanced cash management efficiency of gov...,https://www.thehindubusinessline.com/money-and...,https://bl-i.thgim.com/public/incoming/em0t96/...,en,thehindubusinessline.com,


In [19]:
metadata.shape

(12579, 10)

In [20]:
metadata['uuid'].nunique()

12579

### 'metadata' dataset contains 'uuid' for those articles which are from 'State Bank of India'

In [21]:
entities['name'].value_counts()

name
State Bank of India    12579
Name: count, dtype: int64

### meta_similar_unique contains those samples whose 'uuid's are not present in the 'metadata' dataset
### Taking out the data for those 'uuid's which are not present in 'metadata' 

In [22]:
meta_similar_unique = metadata_similar[metadata_similar['uuid'].isin(metadata['uuid']) == False].reset_index().drop(columns = ['index', 'Unnamed: 0'])

In [23]:
meta_similar_unique

Unnamed: 0,published_at,uuid,title,description,snippet,url,image_url,language,source,relevance_score
0,2024-09-18T12:02:18.000000Z,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f,Rise In Unsecured Small Ticket Loans Not Alarm...,The rise in small value unsecured loans with a...,The rise in small-value unsecured loans with a...,https://www.ndtvprofit.com/economy-finance/sbi...,https://media.assettype.com/bloombergquint%2F2...,en,bloombergquint.com,
1,2024-09-18T12:02:18.000000Z,7126ca9f-90cf-4a03-b6e9-5ee6d0a0ec9c,"SBI Raises Rs 7,500 Crore Via Second Tranche O...","State Bank of India bagged Rs 7,500 crore thro...","State Bank of India bagged Rs 7,500 crore thro...",https://www.ndtvprofit.com/business/sbi-raises...,https://media.assettype.com/bloombergquint%2F2...,en,bloombergquint.com,
2,2024-09-18T11:26:00.000000Z,78644d24-249e-40a2-8151-5320d71eee4f,Stock Market Sectors: Stock market update: Sug...,The 30-share BSE Sensex closed down 131.43 p...,\n\n\n\n(You can now subscribe to our\n\n(You ...,https://economictimes.indiatimes.com/markets/s...,"https://img.etimg.com/thumb/msid-83515267,widt...",en,economictimes.indiatimes.com,
3,2024-09-15T05:40:15.000000Z,986ea556-7b6a-458c-86ad-e7c1b9687a60,Mcap: Nine of top-10 most valued firms jump ₹2...,Reliance Industries led the chart of the top-1...,Nine of the top-10 most valued firms together ...,https://www.thehindubusinessline.com/markets/m...,https://bl-i.thgim.com/public/incoming/4vvb0i/...,en,thehindubusinessline.com,
4,2024-09-13T10:50:30.000000Z,2b0ceca0-0641-4c05-a24f-a989dddc8c6e,Stock Market Sectors: Stock market update: FMC...,The 30-share BSE Sensex closed down 71.77 po...,\n\n\n\n(You can now subscribe to our\n\n(You ...,https://economictimes.indiatimes.com/markets/s...,"https://img.etimg.com/thumb/msid-83800143,widt...",en,economictimes.indiatimes.com,
...,...,...,...,...,...,...,...,...,...,...
728,2022-09-14T11:41:45.000000Z,0a6f3830-86a7-4911-b785-a538f5d394d1,Stock Market Sectors: Stock market update: Fer...,The 30-share BSE Sensex closed down 224.11 p...,NEW DELHI: Fertilisers shares closed lower in ...,https://economictimes.indiatimes.com/markets/s...,"https://img.etimg.com/thumb/msid-83209716,widt...",en,economictimes.indiatimes.com,
729,2022-09-14T08:19:07.000000Z,af86695f-0983-40be-b4b9-eba62c116c75,SBI becomes third bank to cross Rs 5 lakh cror...,SBI market cap: Market watchers believe that S...,Country’s biggest lender by assets State Bank ...,https://www.businesstoday.in/markets/story/sbi...,https://akm-img-a-in.tosshub.com/businesstoday...,en,businesstoday.in,
730,2022-09-14T04:49:31.000000Z,33d7d348-9c03-4c28-95fc-ee956a9f4f5b,Stock Market Sectors: Stock market update: Min...,The 30-share BSE Sensex was down 566.16 poin...,NEW DELHI: Mining stocks were trading lower on...,https://economictimes.indiatimes.com/markets/s...,"https://img.etimg.com/thumb/msid-83608600,widt...",en,economictimes.indiatimes.com,
731,2022-09-13T15:09:41.000000Z,fe80ab58-39cc-41df-8425-3f15a2db4e42,BharatPe appoints two new independent directors,The two appointed independent directors are BP...,After months-long tussle between Ashneer Grove...,https://www.thehindubusinessline.com/companies...,https://bl-i.thgim.com/public/incoming/une67m/...,en,thehindubusinessline.com,


#### time correction

In [24]:
def correct_time(data):
    data['published_at'] = data['published_at'].apply(lambda x: x[:19])
    data['published_at'] = pd.to_datetime(data['published_at'])

In [25]:
correct_time(meta_similar_unique)
correct_time(metadata)

In [26]:
meta_similar_unique['published_at']

0     2024-09-18 12:02:18
1     2024-09-18 12:02:18
2     2024-09-18 11:26:00
3     2024-09-15 05:40:15
4     2024-09-13 10:50:30
              ...        
728   2022-09-14 11:41:45
729   2022-09-14 08:19:07
730   2022-09-14 04:49:31
731   2022-09-13 15:09:41
732   2021-02-20 11:38:00
Name: published_at, Length: 733, dtype: datetime64[ns]

#### Data for State Bank of India in 'entities_similar'
keeping only those samples which have name = 'State Bank of India'

In [27]:
entities_similar['name'].value_counts()

name
State Bank of India               5657
Bank of India Limited             1745
Tata Motors Limited               1041
HDFC Bank Limited                  739
NTPC Limited                       718
                                  ... 
Solar Industries India Limited       1
GRM Overseas Limited                 1
Poonawalla Fincorp Limited           1
Suprajit Engineering Limited         1
STATE BANK OF INDIA                  1
Name: count, Length: 586, dtype: int64

In [28]:
entities_similar = entities_similar[entities_similar['name'] == 'State Bank of India'].reset_index().drop(columns = ['index', 'Unnamed: 0'])

In [29]:
entities_similar.shape

(5657, 10)

In [30]:
entities_similar['uuid'].nunique()

1887

Out of 5657, 2197 uuid in entities_similar are not present in metadata['uuid']

In [31]:
sum(entities_similar['uuid'].isin(metadata['uuid']) == False)

2197

In [32]:
sum(entities_similar['uuid'].isin(metadata['uuid']) == True)

3460

### from 'entities_similar' taking out those 'uuid' which are present in 'entities_similar' but not present in 'metadata'

In [33]:
uuidinsbi_entities_similar = entities_similar[entities_similar['uuid'].isin(metadata['uuid']) == False]

In [34]:
uuidinsbi_entities_similar.head(3)

Unnamed: 0,symbol,name,exchange,exchange_long,country,type,industry,match_score,sentiment_score,uuid
0,SBKFF,State Bank of India,,,us,equity,Financial Services,31.545517,-0.4452,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f
1,SBIN.NS,State Bank of India,,,in,equity,Financial Services,31.545378,-0.4452,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f
2,SBIN.BO,State Bank of India,,,in,equity,Financial Services,31.544556,-0.4452,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f


### All the 'uuid's which are present in 'entities_similar' and 'name' = 'State Bank of India' are same as the 'uuid's which we got from 'meta_similar_unique'

In [35]:
sum(uuidinsbi_entities_similar['uuid'].isin(meta_similar_unique['uuid']) == False)

0

In [36]:
highlights = highlights.drop(columns = ['Unnamed: 0'])

### Checking for duplicates 

In [37]:
highlights.iloc[7584, :]

highlight         Ltd., Citigroup Inc., Finbud Financial Service...
sentiment                                                    0.4019
highlighted_in                                            main_text
uuid                           aa628d63-04b6-406f-8bfd-02e68114aa1f
Name: 7584, dtype: object

In [38]:
highlights[highlights.duplicated() == True]

Unnamed: 0,highlight,sentiment,highlighted_in,uuid
5355,"A few years ago, when five associate banks of ...",0.0,main_text,e73c1d80-e502-4bac-a8f1-962f92c11bc6
7585,"Ltd., Citigroup Inc., Finbud Financial Service...",0.4019,main_text,aa628d63-04b6-406f-8bfd-02e68114aa1f
7981,"For instance, <em>State</em> <em>Bank</em> <em...",0.6908,main_text,4a572cff-8615-4e26-a492-8a1531926b88
12091,the Securitization and Reconstruction of Finan...,0.872,main_text,3107e093-b3b4-40bd-ad28-3218d13949b7
12092,<em>Bank</em> <em>of</em> <em>India</em>) as s...,0.6369,main_text,3107e093-b3b4-40bd-ad28-3218d13949b7
12093,"For detailed terms and conditions of the sale,...",0.6124,main_text,3107e093-b3b4-40bd-ad28-3218d13949b7


In [39]:
highlights.drop_duplicates(inplace = True)

In [42]:
print(highlights['highlight'][7584])

Ltd., Citigroup Inc., Finbud Financial Services Pvt Ltd, HDFC Bank Ltd., HSBC Holdings Plc, ICICI Bank Ltd., IDBI Bank Ltd., Indian Overseas Bank, Mahindra and Mahindra Financial Services Ltd., PPF Group, Punjab National Bank, <em>State</em> <em>Bank</em> <em>of</em> <em>India</em>, Tata Sons Pvt. Ltd., UCO BANK, Union Bank of India, Whizdm Innovations Pvt. Ltd., and Yes Bank Ltd.


### Taking the 'title' from 'metadata' dataset with it sentiment score from the 'entities' dataset

In [43]:
metadata['published_at']

0       2024-09-18 12:45:04
1       2024-09-18 12:37:42
2       2024-09-18 11:56:10
3       2024-09-18 11:36:14
4       2024-09-18 11:27:31
                ...        
12574   2021-01-01 14:55:00
12575   2021-01-01 11:00:40
12576   2021-01-01 10:45:00
12577   2021-01-01 07:40:29
12578   2021-01-01 04:59:03
Name: published_at, Length: 12579, dtype: datetime64[ns]

In [44]:
temp_df = metadata[['published_at', 'uuid', 'title', 'description']]
# temp_df = metadata[['uuid', 'title', 'description']] # originally(for your need)

In [45]:
temp_df.shape

(12579, 4)

In [46]:
temp_df.head(2)

Unnamed: 0,published_at,uuid,title,description
0,2024-09-18 12:45:04,bb5429b2-be36-4088-b04c-9f17e9003d2d,Banking system low-cost deposits could decline...,"SBI’s Casa ratio stood at 40.7% as on 30 June,..."
1,2024-09-18 12:37:42,0b24d613-2705-4283-b45e-3b8942fc5762,"State Bank of India raises Rs 7,500 cr at 7.33...",Tier-2 bond issuance: This issuance follows a ...


In [47]:
# temp_df.merge(entities[['uuid', 'sentiment_score']], how = 'left', on = 'uuid') #originally
temp_df.merge(entities[['uuid', 'sentiment_score']], how = 'left', on = 'uuid')

Unnamed: 0,published_at,uuid,title,description,sentiment_score
0,2024-09-18 12:45:04,bb5429b2-be36-4088-b04c-9f17e9003d2d,Banking system low-cost deposits could decline...,"SBI’s Casa ratio stood at 40.7% as on 30 June,...",0.612400
1,2024-09-18 12:37:42,0b24d613-2705-4283-b45e-3b8942fc5762,"State Bank of India raises Rs 7,500 cr at 7.33...",Tier-2 bond issuance: This issuance follows a ...,0.000000
2,2024-09-18 11:56:10,9bcbd8e3-7f97-4d00-aae8-1f08f1a674fd,"SBI raises ₹15,000 cr via Basel III bonds in a...","In less than a month, State Bank of India rais...",0.296000
3,2024-09-18 11:36:14,0d05c075-706d-4a5e-920c-ae39a41b420b,Stock Market Sectors: Stock market update: Min...,The 30-share BSE Sensex closed down 131.43 p...,0.771700
4,2024-09-18 11:27:31,6bf48528-aaf6-4c9d-93b8-303c23033b91,"Banks’ CASA ratio may fall further as cos, gov...",SBI Chairman discusses impact of enhanced cash...,0.361200
...,...,...,...,...,...
12574,2021-01-01 14:55:00,acf77079-622e-471d-9dc9-7fb9eace9311,"CBI books Shakti Bhog Foods in alleged Rs 3,26...",Read more about CBI books Shakti Bhog Foods in...,-0.630733
12575,2021-01-01 11:00:40,e833cd42-e31b-4d5f-9790-24f6046cbb47,Sensex Gains For Eighth Straight Day; Nifty En...,Eight of 11 sector gauges compiled by the Nati...,0.007800
12576,2021-01-01 10:45:00,b26b2c4f-7831-498f-92b2-462b7083df58,"Sensex begins 2021 with gains, Nifty closes ab...",The first trading session of 2021 was not much...,0.202300
12577,2021-01-01 07:40:29,b9619902-1d57-4a0d-9da6-75138ff6e4c9,Stock Markets Will Remain Closed On These Days...,"In calendar year 2020, the Sensex rose 15.75 p...",0.784500


In [48]:
temp_df = temp_df.merge(entities[['uuid', 'sentiment_score']], how = 'left', on = 'uuid')

In [49]:
highlights.head(2)

Unnamed: 0,highlight,sentiment,highlighted_in,uuid
0,Mumbai: The share of low-cost deposits in the ...,0.6124,main_text,bb5429b2-be36-4088-b04c-9f17e9003d2d
1,The <em>State</em> <em>Bank</em> <em>of</em> <...,0.0,main_text,0b24d613-2705-4283-b45e-3b8942fc5762


In [50]:
highlights.groupby('uuid').agg({
    'highlight' : " ".join
}).reset_index()

Unnamed: 0,uuid,highlight
0,00100e6b-81d5-4af3-92c2-e792778f9d1a,"HDFC Bank, <em>State</em> <em>Bank</em> <em>of..."
1,0014fafb-638c-42bb-9649-6c5e37f50255,While competitive home loan interest rates und...
2,0016b2bf-0c23-43a7-a354-19434d316124,Several large banks such as <em>State</em> <em...
3,0018ff17-343c-4b65-ab37-f054ec025c47,"professor.Following directives from Chintu, Pi..."
4,001bd1a1-a4b4-4e33-891e-548b6e9aa73f,<em>State</em> <em>Bank</em> <em>of</em> <em>I...
...,...,...
12574,ffde1b9c-b498-4e60-b715-d83e360734ea,Axis Securities likes stocks such as ICICI Ban...
12575,ffe80339-c14d-4145-87ff-bb5a75f88275,firms suffered erosion from their market capit...
12576,ffe939f1-a6be-484b-90a3-7c01b543f31d,"Dinesh Kumar Khara, Chairman, <em>State</em> <..."
12577,fff550a2-dfed-44c3-80be-187d3b381178,New Delhi: <em>State</em> <em>Bank</em> <em>of...


In [51]:
temp_highlights = highlights.groupby('uuid').agg({
    'highlight' : " ".join
}).reset_index()

### merging 'temp_df' and 'temp_highlights' on 'uuid' field

In [52]:
temp_df.merge(temp_highlights, on = 'uuid').head(3)

Unnamed: 0,published_at,uuid,title,description,sentiment_score,highlight
0,2024-09-18 12:45:04,bb5429b2-be36-4088-b04c-9f17e9003d2d,Banking system low-cost deposits could decline...,"SBI’s Casa ratio stood at 40.7% as on 30 June,...",0.6124,Mumbai: The share of low-cost deposits in the ...
1,2024-09-18 12:37:42,0b24d613-2705-4283-b45e-3b8942fc5762,"State Bank of India raises Rs 7,500 cr at 7.33...",Tier-2 bond issuance: This issuance follows a ...,0.0,The <em>State</em> <em>Bank</em> <em>of</em> <...
2,2024-09-18 11:56:10,9bcbd8e3-7f97-4d00-aae8-1f08f1a674fd,"SBI raises ₹15,000 cr via Basel III bonds in a...","In less than a month, State Bank of India rais...",0.296,Shares of <em>State</em> <em>Bank</em> <em>of<...


In [53]:
df = temp_df.merge(temp_highlights, on = 'uuid')

In [54]:
df['article'] = df['title'] +' '+ df['description'] +' '+ df['highlight']

In [55]:
df.drop(columns = ['title', 'description', 'highlight'], inplace = True)

### Dataset ready for normal data

In [56]:
df.head(3)

Unnamed: 0,published_at,uuid,sentiment_score,article
0,2024-09-18 12:45:04,bb5429b2-be36-4088-b04c-9f17e9003d2d,0.6124,Banking system low-cost deposits could decline...
1,2024-09-18 12:37:42,0b24d613-2705-4283-b45e-3b8942fc5762,0.0,"State Bank of India raises Rs 7,500 cr at 7.33..."
2,2024-09-18 11:56:10,9bcbd8e3-7f97-4d00-aae8-1f08f1a674fd,0.296,"SBI raises ₹15,000 cr via Basel III bonds in a..."


#### aakash

In [57]:
# df_sorted = df.sort_values(by = 'published_at', ascending = True).reset_index(drop = True)

In [58]:
# df_sorted

In [59]:
# df_sorted.to_csv('dataset.csv', index = False)

### doing the same thing for datasets under 'similar'

In [60]:
meta_similar_unique.head(2)

Unnamed: 0,published_at,uuid,title,description,snippet,url,image_url,language,source,relevance_score
0,2024-09-18 12:02:18,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f,Rise In Unsecured Small Ticket Loans Not Alarm...,The rise in small value unsecured loans with a...,The rise in small-value unsecured loans with a...,https://www.ndtvprofit.com/economy-finance/sbi...,https://media.assettype.com/bloombergquint%2F2...,en,bloombergquint.com,
1,2024-09-18 12:02:18,7126ca9f-90cf-4a03-b6e9-5ee6d0a0ec9c,"SBI Raises Rs 7,500 Crore Via Second Tranche O...","State Bank of India bagged Rs 7,500 crore thro...","State Bank of India bagged Rs 7,500 crore thro...",https://www.ndtvprofit.com/business/sbi-raises...,https://media.assettype.com/bloombergquint%2F2...,en,bloombergquint.com,


In [61]:
meta_similar_unique.shape

(733, 10)

In [62]:
uuidinsbi_entities_similar.head(5)

Unnamed: 0,symbol,name,exchange,exchange_long,country,type,industry,match_score,sentiment_score,uuid
0,SBKFF,State Bank of India,,,us,equity,Financial Services,31.545517,-0.4452,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f
1,SBIN.NS,State Bank of India,,,in,equity,Financial Services,31.545378,-0.4452,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f
2,SBIN.BO,State Bank of India,,,in,equity,Financial Services,31.544556,-0.4452,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f
3,SBKFF,State Bank of India,,,us,equity,Financial Services,32.14475,0.4588,7126ca9f-90cf-4a03-b6e9-5ee6d0a0ec9c
4,SBIN.NS,State Bank of India,,,in,equity,Financial Services,32.14463,0.4588,7126ca9f-90cf-4a03-b6e9-5ee6d0a0ec9c


In [63]:
uuidinsbi_entities_similar.shape

(2197, 10)

In [64]:
uuidinsbi_entities_similar['uuid'].nunique()

733

In [65]:
highlights_similar['uuid'].isin(uuidinsbi_entities_similar['uuid'])

0        True
1        True
2        True
3       False
4       False
        ...  
2568    False
2569    False
2570     True
2571    False
2572     True
Name: uuid, Length: 2573, dtype: bool

In [66]:
highlights_similar[highlights_similar['uuid'].isin(uuidinsbi_entities_similar['uuid']) == True]

Unnamed: 0.1,Unnamed: 0,highlight,sentiment,highlighted_in,uuid
0,0,The rise in small-value unsecured loans with a...,-0.4452,main_text,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f
1,1,<em>State</em> <em>Bank</em> <em>of</em> <em>I...,0.4588,main_text,7126ca9f-90cf-4a03-b6e9-5ee6d0a0ec9c
2,2,"(up 1.29%), <em>State</em> <em>Bank</em> <em>o...",0.7717,main_text,78644d24-249e-40a2-8151-5320d71eee4f
6,6,Reliance Industries was leading the chart of t...,0.4877,main_text,986ea556-7b6a-458c-86ad-e7c1b9687a60
7,7,The valuation of Reliance Industries jumped ₹9...,0.0000,main_text,986ea556-7b6a-458c-86ad-e7c1b9687a60
...,...,...,...,...,...
2561,2561,Also Read: <em>Axis</em> <em>Bank</em> Fixed D...,0.5719,main_text,af86695f-0983-40be-b4b9-eba62c116c75
2566,2566,NMDC Ltd(down 0.28%) and KIOCL(down 0.20%) wer...,0.2023,main_text,33d7d348-9c03-4c28-95fc-ee956a9f4f5b
2567,2567,"<em>Auto</em>(up 0.55%), Mahindra & Mahindra(u...",0.7717,main_text,33d7d348-9c03-4c28-95fc-ee956a9f4f5b
2570,2570,The board of directors of BharatPe now include...,0.0000,main_text,fe80ab58-39cc-41df-8425-3f15a2db4e42


In [67]:
highlights_similar = highlights_similar[highlights_similar['uuid'].isin(uuidinsbi_entities_similar['uuid']) == True]

In [68]:
highlights_similar.groupby('uuid').agg({
    'highlight' : " ".join
}).reset_index()

Unnamed: 0,uuid,highlight
0,0072285b-bc1d-481b-a6be-e3b2bc72dbc2,He has also worked at the Industrial Developme...
1,009f04c9-aea3-4071-944f-7e280c5c3eeb,"gainers in the Nifty pack.On the other hand, A..."
2,00a13e9c-dc75-4a1a-b226-26c11006adf5,"SBI's ex-chief and chairman, BharatPe, Rajnish..."
3,00f0c29e-3f23-4e0f-b383-f0eb36fa771c,Bajaj Finance was the biggest gainer in the Se...
4,011d361e-62b8-49da-8672-01be9bedba17,"(down 1.53%), State Bank of India(down 1.46%),..."
...,...,...
728,fe4d2ea9-a7d8-44e1-9050-27bd4809f83c,"(up 3.63 per cent), <em>State</em> <em>Bank</e..."
729,fe80ab58-39cc-41df-8425-3f15a2db4e42,The board of directors of BharatPe now include...
730,ff7f48db-7e19-4d19-b1dc-4b0d4aba5b83,"Amid the ongoing unrest in Manipur, <em>State<..."
731,ff956f23-d2a8-4554-a36a-99044489daa5,"Nestle India(up 1.15%), HCL Technologies(up 0...."


### temp_high_similar

In [69]:
temp_high_similar = highlights_similar.groupby('uuid').agg({
    'highlight' : " ".join
}).reset_index()

In [70]:
temp_high_similar.columns

Index(['uuid', 'highlight'], dtype='object')

### temp_ent_similar

In [71]:
temp_ent_similar = uuidinsbi_entities_similar.groupby('uuid').agg({
    'sentiment_score' : 'mean'
}).reset_index()

In [72]:
temp_ent_similar.head(2)

Unnamed: 0,uuid,sentiment_score
0,0072285b-bc1d-481b-a6be-e3b2bc72dbc2,0.0
1,009f04c9-aea3-4071-944f-7e280c5c3eeb,0.3548


In [73]:
temp_ent_similar.merge(temp_high_similar, on = 'uuid')

Unnamed: 0,uuid,sentiment_score,highlight
0,0072285b-bc1d-481b-a6be-e3b2bc72dbc2,0.00000,He has also worked at the Industrial Developme...
1,009f04c9-aea3-4071-944f-7e280c5c3eeb,0.35480,"gainers in the Nifty pack.On the other hand, A..."
2,00a13e9c-dc75-4a1a-b226-26c11006adf5,0.53670,"SBI's ex-chief and chairman, BharatPe, Rajnish..."
3,00f0c29e-3f23-4e0f-b383-f0eb36fa771c,0.00000,Bajaj Finance was the biggest gainer in the Se...
4,011d361e-62b8-49da-8672-01be9bedba17,0.00000,"(down 1.53%), State Bank of India(down 1.46%),..."
...,...,...,...
728,fe4d2ea9-a7d8-44e1-9050-27bd4809f83c,0.20230,"(up 3.63 per cent), <em>State</em> <em>Bank</e..."
729,fe80ab58-39cc-41df-8425-3f15a2db4e42,0.00000,The board of directors of BharatPe now include...
730,ff7f48db-7e19-4d19-b1dc-4b0d4aba5b83,0.16525,"Amid the ongoing unrest in Manipur, <em>State<..."
731,ff956f23-d2a8-4554-a36a-99044489daa5,0.66455,"Nestle India(up 1.15%), HCL Technologies(up 0...."


In [74]:
temp = temp_ent_similar.merge(temp_high_similar, on = 'uuid')

### meta_similar_unique

In [75]:
meta_similar_unique.head(2)

Unnamed: 0,published_at,uuid,title,description,snippet,url,image_url,language,source,relevance_score
0,2024-09-18 12:02:18,e75cdefa-a7e0-46ba-8a0c-d2574e7bfc8f,Rise In Unsecured Small Ticket Loans Not Alarm...,The rise in small value unsecured loans with a...,The rise in small-value unsecured loans with a...,https://www.ndtvprofit.com/economy-finance/sbi...,https://media.assettype.com/bloombergquint%2F2...,en,bloombergquint.com,
1,2024-09-18 12:02:18,7126ca9f-90cf-4a03-b6e9-5ee6d0a0ec9c,"SBI Raises Rs 7,500 Crore Via Second Tranche O...","State Bank of India bagged Rs 7,500 crore thro...","State Bank of India bagged Rs 7,500 crore thro...",https://www.ndtvprofit.com/business/sbi-raises...,https://media.assettype.com/bloombergquint%2F2...,en,bloombergquint.com,


In [76]:
temp.merge(meta_similar_unique[['uuid', 'title', 'description']], on = 'uuid')

Unnamed: 0,uuid,sentiment_score,highlight,title,description
0,0072285b-bc1d-481b-a6be-e3b2bc72dbc2,0.00000,He has also worked at the Industrial Developme...,Sundararaman Ramamurthy takes charge as managi...,Sundararaman Ramamurthy has assumed charge as ...
1,009f04c9-aea3-4071-944f-7e280c5c3eeb,0.35480,"gainers in the Nifty pack.On the other hand, A...",Stock Market Sectors: Stock market update: FMC...,The 30-share BSE Sensex was up 267.64 points...
2,00a13e9c-dc75-4a1a-b226-26c11006adf5,0.53670,"SBI's ex-chief and chairman, BharatPe, Rajnish...","'Peeche Dekho, Aage Badho': How ex-SBI chief R...",
3,00f0c29e-3f23-4e0f-b383-f0eb36fa771c,0.00000,Bajaj Finance was the biggest gainer in the Se...,"Sensex gains 267 pts, Nifty ends at 19,393 on ...",Equity indices Sensex & Nifty rose on Mon. due...
4,011d361e-62b8-49da-8672-01be9bedba17,0.00000,"(down 1.53%), State Bank of India(down 1.46%),...",Stock Market Sectors: Stock market update: Sug...,The 30-share BSE Sensex was down 284.45 poin...
...,...,...,...,...,...
728,fe4d2ea9-a7d8-44e1-9050-27bd4809f83c,0.20230,"(up 3.63 per cent), <em>State</em> <em>Bank</e...",Share market update: Most active stocks on D-S...,The NSE Nifty index was trading 31.35 points ...
729,fe80ab58-39cc-41df-8425-3f15a2db4e42,0.00000,The board of directors of BharatPe now include...,BharatPe appoints two new independent directors,The two appointed independent directors are BP...
730,ff7f48db-7e19-4d19-b1dc-4b0d4aba5b83,0.16525,"Amid the ongoing unrest in Manipur, <em>State<...",Manipur violence: State Bank of India offers s...,"Amid the ongoing unrest in Manipur, State Bank..."
731,ff956f23-d2a8-4554-a36a-99044489daa5,0.66455,"Nestle India(up 1.15%), HCL Technologies(up 0....",Stock Market Sectors: Stock market update: Pow...,The 30-share BSE Sensex was down 166.47 poin...


In [77]:
t = temp.merge(meta_similar_unique[['uuid', 'title', 'description']], on = 'uuid')

In [78]:
t['article'] = t['title']+' '+t['description']+' '+t['highlight']

In [79]:
t.drop(columns = ['highlight', 'title', 'description'], axis = 1, inplace = True)

In [80]:
t.head()

Unnamed: 0,uuid,sentiment_score,article
0,0072285b-bc1d-481b-a6be-e3b2bc72dbc2,0.0,Sundararaman Ramamurthy takes charge as managi...
1,009f04c9-aea3-4071-944f-7e280c5c3eeb,0.3548,Stock Market Sectors: Stock market update: FMC...
2,00a13e9c-dc75-4a1a-b226-26c11006adf5,0.5367,
3,00f0c29e-3f23-4e0f-b383-f0eb36fa771c,0.0,"Sensex gains 267 pts, Nifty ends at 19,393 on ..."
4,011d361e-62b8-49da-8672-01be9bedba17,0.0,Stock Market Sectors: Stock market update: Sug...


### Forming the final dataset

In [81]:
df = pd.concat([df, t], axis = 0)

In [82]:
df.isnull().sum()

published_at       733
uuid                 0
sentiment_score      0
article             81
dtype: int64

In [83]:
df = df.dropna().drop_duplicates()

In [84]:
df = df.reset_index()

In [85]:
df.drop(columns = 'index', inplace = True)

In [86]:
df['article'][0]

'Banking system low-cost deposits could decline further, says SBI chairman Setty SBI’s Casa ratio stood at 40.7% as on 30 June, down from 42.9% from the same period last year. Its loan book expanded by a robust 15.4% year-on-year, outpacing deposit growth of 8.2% in the June quarter. Mumbai: The share of low-cost deposits in the banking system could decline further and go below the levels seen before covid-19 on the back of efficient cash management by the government, the chairman of India’s largest lender <em>State</em> <em>Bank</em> <em>of</em> <em>India</em> (SBI) C.S. Setty said.'

# Cleaning

### Lemmatization using NLTK

In [87]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J' : wordnet.ADJ, 
        'N' : wordnet.NOUN,
        'V' : wordnet.VERB,
        'R' : wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

sentence = 'Banking system low-cost deposit could decline far , say SBI chairman Setty SBI ’ s Casa ratio stood at 40.7 % a on 30 June , down from 42.9 % from the same period last year . Its loan book expand by a robust 15.4 % year-on-year , outpace deposit growth of 8.2 % in the June quarter . Mumbai : The share of low-cost deposit in the banking system could decline far and go below the level see before covid-19 on the back of efficient cash management by the government , the chairman of India ’ s large lender < em > State < /em > < em > Bank < /em > < em > of < /em > < em > India < /em > ( SBI ) C.S . Setty say .'

lemm_sentence = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in nltk.word_tokenize(sentence)]

" ".join(lemm_sentence)

[nltk_data] Downloading package wordnet to C:\Users\PRASHANT
[nltk_data]     KAJAL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\PRASHANT
[nltk_data]     KAJAL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PRASHANT KAJAL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\PRASHANT KAJAL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


'Banking system low-cost deposit could decline far , say SBI chairman Setty SBI ’ s Casa ratio stood at 40.7 % a on 30 June , down from 42.9 % from the same period last year . Its loan book expand by a robust 15.4 % year-on-year , outpace deposit growth of 8.2 % in the June quarter . Mumbai : The share of low-cost deposit in the banking system could decline far and go below the level see before covid-19 on the back of efficient cash management by the government , the chairman of India ’ s large lender < em > State < /em > < em > Bank < /em > < em > of < /em > < em > India < /em > ( SBI ) C.S . Setty say .'

### Lemmatization using spaCy

In [93]:
!pip install --upgrade spacy



In [96]:
!pip install blis thinc



In [97]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence)

lemm_sentence = [token.lemma_ for token in doc]

' '.join(lemm_sentence)

ValueError: BLIS support requires blis: pip install blis

### Lemmatization

In [None]:
ps = PorterStemmer()
puncts = string.punctuation
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tags = re.compile(r'<.*?>')
remove_non_alpha = re.compile(r'\W')
spaces = re.compile(r'\s+')
not_alpha_numeric = re.compile(r'[^a-zA-Z0-9\s]')
urls = re.compile(r'http\S+|www\S+')
remove_non_ascii = re.compile(r'[^\x00-\x7F]+')
mentions = re.compile(r'[@#]\w+')

nlp = spacy.load('en_core_web_sm')

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J' : wordnet.ADJ, 
        'N' : wordnet.NOUN,
        'V' : wordnet.VERB,
        'R' : wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

def cleaning(text, stemming = True):
    text = text.lower()
    text = tags.sub('', text)
#     text = remove_notn_alpha.sub(, ' ', text) # we don't want to lose the numerical information
    text = spaces.sub(' ', text)
    text = not_alpha_numeric.sub(' ', text)
    text = urls.sub('', text)
#     text = re.sub(r'\d+', '', text)
    text = remove_non_ascii.sub('', text) # to remove non-ascii characters
    text = mentions.sub('', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    tokens = word_tokenize(text)
    
    # Stemming
    if stemming:
        s = [word for word in tokens if word not in stop_words]
        s = [ps.stem(word) for word in s]
        
    # Lemmatization
    else:
        doc = nlp(text)
        s = [token.lemma_ for token in doc]
        
    return ' '.join(s)

In [None]:
cleaning(df['article'][0], stemming=True)

In [None]:
cleaning(df['article'][0], stemming=False)

#### Enabling tqdm for pandas

In [None]:
tqdm.pandas()

In [None]:
df['article'] = df['article'].progress_apply(lambda x: cleaning(x, stemming = False))

In [567]:
df['article'][0]

'banking system low cost deposit could decline far   say sbi chairman setty sbi s casa ratio stand at 40 7   as on 30 june   down from 42 9   from the same period last year   its loan book expand by a robust 15 4   year on year   outpace deposit growth of 8 2   in the june quarter   mumbai   the share of low cost deposit in the banking system could decline far and go below the level see before covid 19 on the back of efficient cash management by the government   the chairman of india s large lender state bank of india   sbi   c s   setty say'

# feature engineering 

In [568]:
df.head()

Unnamed: 0,uuid,sentiment_score,article
0,bb5429b2-be36-4088-b04c-9f17e9003d2d,0.6124,banking system low cost deposit could decline ...
1,0b24d613-2705-4283-b45e-3b8942fc5762,0.0,state bank of india raise rs 7 500 cr at 7 33 ...
2,9bcbd8e3-7f97-4d00-aae8-1f08f1a674fd,0.296,sbi raise 15 000 cr via basel iii bond in a ...
3,0d05c075-706d-4a5e-920c-ae39a41b420b,0.7717,stock market sector stock market update mi...
4,6bf48528-aaf6-4c9d-93b8-303c23033b91,0.3612,bank casa ratio may fall far as cos govt c...


In [569]:
df.describe()

Unnamed: 0,sentiment_score
count,13231.0
mean,0.282974
std,0.391764
min,-0.9601
25%,0.0
50%,0.34
75%,0.588175
max,0.9764


In [570]:
df['sentiment_score'] = np.round(df['sentiment_score'], 4)

### TFIDF

In [571]:
# max_df = [1.0]
# min_df = [1.0, 0.9, 0.8, 0.7, 0.6]
# max_features : default = None

tfidf = TfidfVectorizer(sublinear_tf = True, max_features = 1000)
tfidf.fit(df['article'])

In [572]:
tf_data = tfidf.transform(df['article']).toarray()

In [573]:
tfidf.idf_

array([5.28090726, 3.23975791, 4.64975176, 4.72834203, 4.50897921,
       4.62960719, 4.1932841 , 4.54759404, 4.50897921, 4.49892887,
       4.23848953, 2.86431066, 4.646849  , 3.63287935, 3.49371193,
       3.87299044, 3.86367567, 3.12965351, 3.76776362, 3.89188439,
       3.91253206, 3.92372099, 6.00175705, 3.7872053 , 5.11511501,
       5.42148922, 5.13853528, 4.92204891, 4.01342105, 3.51031748,
       3.85313539, 4.05906234, 3.90009237, 4.02892524, 3.94648157,
       3.47378373, 4.25011757, 4.20253486, 4.28988424, 4.19144417,
       2.75277713, 5.58511864, 3.97716331, 4.54235843, 4.50394141,
       4.39231914, 3.94504376, 4.54759404, 4.62676224, 4.50394141,
       4.58776008, 4.41504739, 5.31990942, 4.4573072 , 4.40817451,
       4.54759404, 4.39231914, 3.87969737, 4.56080427, 4.59599058,
       4.4453881 , 4.57149956, 3.6276355 , 4.68525845, 4.60707103,
       4.2919147 , 4.568815  , 4.501432  , 4.18960762, 4.55022216,
       4.52168586, 4.68827504, 4.53455605, 4.48404026, 5.63058

In [574]:
tfidf.vocabulary_

{'banking': 183,
 'system': 879,
 'low': 576,
 'cost': 286,
 'deposit': 317,
 'could': 287,
 'decline': 311,
 'far': 384,
 'say': 807,
 'sbi': 808,
 'chairman': 245,
 'setty': 829,
 'ratio': 754,
 'stand': 851,
 'at': 167,
 '40': 51,
 'as': 162,
 'on': 661,
 '30': 40,
 'june': 532,
 'down': 336,
 'from': 419,
 '42': 54,
 'the': 898,
 'same': 803,
 'period': 696,
 'last': 548,
 'year': 992,
 'its': 523,
 'loan': 570,
 'book': 206,
 'expand': 372,
 'by': 225,
 'robust': 796,
 '15': 17,
 'growth': 445,
 'of': 654,
 'in': 483,
 'quarter': 745,
 'mumbai': 624,
 'share': 833,
 'and': 148,
 'go': 434,
 'below': 192,
 'level': 557,
 'see': 815,
 'before': 190,
 '19': 21,
 'back': 176,
 'cash': 236,
 'management': 586,
 'government': 437,
 'india': 489,
 'large': 546,
 'lender': 554,
 'state': 854,
 'bank': 181,
 'raise': 748,
 'rs': 797,
 '500': 63,
 'cr': 291,
 '33': 44,
 'via': 956,
 'tier': 910,
 'bond': 205,
 'issuance': 518,
 'this': 905,
 'follow': 408,
 'crore': 294,
 'august': 170,
 '2

In [575]:
terms = []
freq = []

for k, v in tfidf.vocabulary_.items():
    terms.append(k)
    freq.append(v)

#### vocabulary size and minimum and highest frequencies

In [576]:
print("vocabulary : ", len(terms))
print(f"Highest : {max(freq)}\t lowest : {min(freq)}")

vocabulary :  1000
Highest : 999	 lowest : 0


#### out of 5000 values in a vector, only 61 Non-zero values are present for first sample

In [577]:
len([i for i in tf_data[0] if i != 0])

57

### n-grams

In [578]:
# ngrams : default = (1, 1)
# max_features : default = None

# considering the bigrams, trigrams and 4-grams
ngrams = TfidfVectorizer(sublinear_tf=True, ngram_range = (2, 3), max_features = 1000)
ngrams.fit(df['article'])

#### without setting max_features, vocabulary is 1514317 for (2, 4)

In [579]:
len([k for k in ngrams.vocabulary_.keys()])

1000

In [580]:
ng_data = ngrams.transform(df['article']).toarray()

In [581]:
len([i for i in ng_data[0] if i != 0])

29

### CBOW(continous bag-of-words)

In [393]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp39-cp39-win_amd64.whl.metadata (8.2 kB)
Downloading gensim-4.3.3-cp39-cp39-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB 1.9 MB/s eta 0:00:13
   ---------------------------------------- 0.2/24.0 MB 2.0 MB/s eta 0:00:13
    --------------------------------------- 0.4/24.0 MB 2.9 MB/s eta 0:00:09
    --------------------------------------- 0.6/24.0 MB 3.1 MB/s eta 0:00:08
   - -------------------------------------- 0.8/24.0 MB 3.5 MB/s eta 0:00:07
   - -------------------------------------- 1.1/24.0 MB 3.7 MB/s eta 0:00:07
   -- ------------------------------------- 1.3/24.0 MB 3.9 MB/s eta 0:00:06
   -- ------------------------------------- 1.5/24.0 MB 4.0 MB/s eta 0:00:06
   -- ------------------------------------- 1.7/24.0 MB 4.2 MB/s eta 0:00:06
   --- ------------------------------------ 1.9/24.0 MB 4.2 MB/s eta 0:00:06
   --- -------

In [394]:
import gensim
from gensim.models import Word2Vec

In [413]:
# vector_size
# window
# min_count : ignore, having frequency below than this
# epochs
# alpha
# negative : default = 5 (negative sampling for each positive class)

# sg = 0 : CBOW ; sg = 1 : skip-gram
# hs = 1 : softmax will be used; hs = 0 : softmax will not be used
# epochs : default = 5

def tokenize(text):
    data = [word for word in word_tokenize(text) if word not in stop_words]
    return data

data = df['article'].progress_apply(tokenize)

100%|████████████████████████████████████████| 13231/13231 [04:36<00:00, 47.85it/s]


In [415]:
cbow = Word2Vec(data, vector_size=200, workers=8, window = 5, sg = 0)

In [416]:
cbow.wv.index_to_key

['bank',
 'india',
 'state',
 '1',
 'sbi',
 '0',
 'market',
 'stock',
 'rs',
 'crore',
 'per',
 'cent',
 'ltd',
 '2',
 'share',
 'hdfc',
 'nifty',
 'point',
 'rate',
 'top',
 'year',
 'say',
 'sensex',
 '5',
 '3',
 'mahindra',
 'loan',
 'among',
 '7',
 'company',
 'bse',
 '10',
 'close',
 'icici',
 'lender',
 'tata',
 'sector',
 '6',
 'index',
 'interest',
 'industry',
 '4',
 'update',
 '30',
 'reliance',
 'trade',
 'high',
 'day',
 'deposit',
 '000',
 'bond',
 'gainer',
 'adani',
 'indian',
 'finance',
 'fund',
 'large',
 '8',
 'include',
 'end',
 'bajaj',
 'report',
 'rise',
 'price',
 '15',
 'financial',
 'axis',
 'fall',
 'rbi',
 'new',
 'offer',
 'national',
 'firm',
 'also',
 'growth',
 'raise',
 'service',
 'power',
 'pack',
 'time',
 'corporation',
 'steel',
 'country',
 'last',
 'banking',
 '9',
 '25',
 'itc',
 'kotak',
 'value',
 '2023',
 'lakh',
 'account',
 '50',
 'baroda',
 'credit',
 'public',
 'profit',
 'lead',
 'first',
 'punjab',
 'payment',
 'hand',
 'airtel',
 'grou

In [417]:
cbow.wv.most_similar('covid')

[('pandemic', 0.8121364116668701),
 ('contraction', 0.713736891746521),
 ('vaccination', 0.7078325748443604),
 ('slow', 0.7073777318000793),
 ('robust', 0.6949485540390015),
 ('steady', 0.6739208698272705),
 ('tepid', 0.6675041317939758),
 ('pre', 0.6656627655029297),
 ('cycle', 0.6655044555664062),
 ('drive', 0.6619682312011719)]

In [422]:
cbow.wv.most_similar('bank', topn=20)

[('canara', 0.6248539686203003),
 ('bob', 0.5749539732933044),
 ('baroda', 0.573883056640625),
 ('namely', 0.5660092830657959),
 ('lender', 0.5627161860466003),
 ('pnb', 0.5493518114089966),
 ('graphite', 0.5457668304443359),
 ('union', 0.544642448425293),
 ('hdfc', 0.5309991240501404),
 ('vs', 0.5242079496383667),
 ('idbi', 0.5222285389900208),
 ('axis', 0.5140102505683899),
 ('yes', 0.5087817907333374),
 ('uco', 0.5080928802490234),
 ('rbl', 0.5074078440666199),
 ('icici', 0.4936247766017914),
 ('idfc', 0.48486122488975525),
 ('punjab', 0.47666043043136597),
 ('four', 0.47547903656959534),
 ('overseas', 0.4706997871398926)]

In [424]:
cbow.wv.doesnt_match(['hdfc', 'sbi', 'bank', 'axis', 'recommendation'])

'recommendation'

In [425]:
cbow.wv.get_vector('sbi')

array([-0.3419571 ,  0.942922  ,  0.12081295,  0.9770466 ,  0.29877862,
       -0.9295656 ,  0.44010848, -2.0481694 , -1.2386979 , -0.09676816,
       -0.27453548,  0.14477713,  0.9966579 ,  0.6152951 ,  1.4462123 ,
        0.19414756, -0.18690403, -0.14078994, -1.1615467 , -0.18985328,
       -0.16481602,  0.69414574,  0.19649014, -0.23171845,  0.50778353,
       -0.30567443,  0.23595282,  0.3195502 , -0.4107982 , -0.24158575,
        0.7417271 , -0.33517244,  0.77078134, -0.44916627, -0.10828814,
        1.5214555 , -0.32589406,  0.8483331 , -0.7042748 , -1.6357707 ,
       -0.7766172 ,  0.6006711 , -1.1062084 ,  0.06038225, -0.2380952 ,
        0.00388834,  0.15305936,  0.7452553 , -0.84791684, -0.6403822 ,
        0.17754169, -0.30600974,  1.5978382 ,  0.01444121,  0.716702  ,
       -0.7857714 , -0.70032734, -0.84229   ,  0.23330039, -0.47602293,
       -0.01805501, -0.2784835 , -0.30944204, -0.09665971, -0.634286  ,
        0.85298336,  0.46891588,  0.595928  , -0.76080734, -0.42

### skip-grams

In [435]:
sgram = Word2Vec(data, vector_size=200, workers=8, window = 5, sg = 1, hs = 0)

In [437]:
sgram.wv.index_to_key

['bank',
 'india',
 'state',
 '1',
 'sbi',
 '0',
 'market',
 'stock',
 'rs',
 'crore',
 'per',
 'cent',
 'ltd',
 '2',
 'share',
 'hdfc',
 'nifty',
 'point',
 'rate',
 'top',
 'year',
 'say',
 'sensex',
 '5',
 '3',
 'mahindra',
 'loan',
 'among',
 '7',
 'company',
 'bse',
 '10',
 'close',
 'icici',
 'lender',
 'tata',
 'sector',
 '6',
 'index',
 'interest',
 'industry',
 '4',
 'update',
 '30',
 'reliance',
 'trade',
 'high',
 'day',
 'deposit',
 '000',
 'bond',
 'gainer',
 'adani',
 'indian',
 'finance',
 'fund',
 'large',
 '8',
 'include',
 'end',
 'bajaj',
 'report',
 'rise',
 'price',
 '15',
 'financial',
 'axis',
 'fall',
 'rbi',
 'new',
 'offer',
 'national',
 'firm',
 'also',
 'growth',
 'raise',
 'service',
 'power',
 'pack',
 'time',
 'corporation',
 'steel',
 'country',
 'last',
 'banking',
 '9',
 '25',
 'itc',
 'kotak',
 'value',
 '2023',
 'lakh',
 'account',
 '50',
 'baroda',
 'credit',
 'public',
 'profit',
 'lead',
 'first',
 'punjab',
 'payment',
 'hand',
 'airtel',
 'grou

In [438]:
sgram.wv.doesnt_match(['bank', 'prepare', 'work', 'government'])

'work'

In [439]:
sgram.wv.most_similar('sebi', topn=10)

[('regulator', 0.7062761187553406),
 ('drhp', 0.6780863404273987),
 ('franklin', 0.6707900762557983),
 ('violation', 0.6625649333000183),
 ('templeton', 0.6589047312736511),
 ('allegation', 0.6474838852882385),
 ('nsdl', 0.645271897315979),
 ('investigate', 0.6427587270736694),
 ('probe', 0.6385255455970764),
 ('irregularity', 0.6377963423728943)]

#### creating vectors for CBOW and skip-grams

Alternate approach to create vectors for the whole sentence is to take the 'tfidf' weights for each word instead of computing the element-wise mean for all the words in a sentence

In [618]:
def sent_to_vector(tokens, model):
    word_vectors = [model.wv.get_vector(word) for word in tokens if word in model.wv.key_to_index]
    
    if len(word_vectors) > 0:
        emb_vector = np.mean(word_vectors, axis = 0)
    else:
        emb_vector = np.zeros(model.vector_size)
    return emb_vector

In [619]:
cbow_data = data.apply(lambda x : sent_to_vector(x, cbow))

In [620]:
sgram_data = data.apply(lambda x : sent_to_vector(x, sgram))

In [621]:
cbow_data

0        [-0.120576695, 0.27348927, -0.19028719, 0.0396...
1        [0.02034735, 0.092150845, -0.6677955, -0.05301...
2        [0.054780457, 0.16940472, -0.60882956, 0.05160...
3        [-0.23958161, 0.18202233, 0.23554286, -0.32710...
4        [-0.19849703, 0.24552888, -0.14863239, 0.29899...
                               ...                        
13226    [-0.2725245, 0.4958529, -0.1313842, -0.3491033...
13227    [-0.329945, 0.12768652, 0.077850215, 0.0956744...
13228    [-0.1294556, 0.22967891, -0.08980837, 0.134809...
13229    [-0.16898939, 0.2631964, 0.3120264, -0.4492970...
13230    [-0.044863816, 0.009695958, -0.11595442, 0.247...
Name: article, Length: 13231, dtype: object

## GloVe

### Using pre-trained Embeddings

In [442]:
import gensim.downloader as api

w2v_model = api.load("word2vec-google-news-300")



In [443]:
import spacy

In [444]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     --------------------------------------- 0.0/42.8 MB 960.0 kB/s eta 0:00:45
     --------------------------------------- 0.0/42.8 MB 960.0 kB/s eta 0:00:45
     --------------------------------------- 0.0/42.8 MB 960.0 kB/s eta 0:00:45
     --------------------------------------- 0.1/42.8 MB 774.0 kB/s eta 0:00:56
     ---------------------------------------- 0.3/42.8 MB 1.4 MB/s eta 0:00:31
      --------------------------------------- 0.6/42.8 MB 2.1 MB/s eta 0:00:21
      --------------------------------------- 0.8/42.8 MB 2.3 MB/s eta 0:00:18
      --------------------------------------- 1.0/42.8 MB 2.8 MB/s eta 0:00:16
     - -------------------------------------- 1.2/42.8 MB 2.9 MB/s eta 0:00:15
     - ------------------------------

In [445]:
glove_model = spacy.load('en_core_web_md')

In [448]:
glove_model.vocab['bank'].vector

array([-4.7267e+00, -3.2360e+00, -7.3710e+00, -1.0489e-01,  3.9620e+00,
        3.6566e+00,  2.7841e+00,  5.8755e-01,  2.3712e+00,  3.5601e+00,
        3.5018e+00,  3.3417e+00, -3.3269e+00,  4.9772e+00, -4.6630e+00,
        5.1539e-01,  1.5147e+00,  1.7821e+00,  2.6395e+00,  1.5738e+00,
        2.7575e+00,  1.3967e+00, -2.6283e+00,  3.8107e+00, -2.5779e+00,
       -1.9897e+00, -1.0475e+00, -5.3496e+00,  3.6473e+00, -4.2020e+00,
        8.3994e-01,  3.3382e+00, -4.0465e+00,  1.6089e+00,  2.7818e+00,
       -1.5335e+00, -4.9039e+00, -7.3172e-01,  7.8021e+00,  4.0585e+00,
       -4.5176e+00,  9.9998e-01, -2.3385e+00, -1.1560e-01,  6.0838e+00,
       -2.3091e+00,  8.2888e+00, -3.1598e+00, -1.5010e+00,  3.5432e+00,
       -1.9039e+00,  2.6200e+00, -1.0613e+00, -3.3889e+00, -4.2885e-01,
       -1.1062e+00,  2.6952e+00,  2.6195e-01, -1.4376e+00, -2.4823e-01,
        7.8905e+00, -2.9588e+00,  2.4996e+00, -7.9778e+00,  1.4914e+00,
       -5.4918e+00, -4.1051e-01, -3.4404e+00, -2.1738e+00,  3.76

In [449]:
def get_embedding(text):
    emb = glove_model(text)
    return emb.vector

In [455]:
glove_data = df['article'].progress_apply(lambda x : get_embedding(x))

100%|████████████████████████████████████████| 13231/13231 [03:32<00:00, 62.19it/s]


#### Internally GloVe takes mean for the all the words in a sentence passed to it

In [464]:
text1 = glove_model('all the news are for sbi').vector
text2 = glove_model('collected data for more almost 4 years for the same company').vector
text3 = glove_model('all the news are for sbi. collected data for more almost 4 years for the same company').vector

In [465]:
mean_vector = (text1 + text2)/2

In [470]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([mean_vector], [text3])

array([[0.987544]], dtype=float32)

In [660]:
glove_data

0        [-1.5232313, -1.3452632, -1.7712574, 1.138419,...
1        [-1.7960392, -2.0483503, -2.6426113, 1.5941111...
2        [-2.5876038, -1.4517138, -2.502686, 1.2334776,...
3        [-0.49346185, -2.2923295, -1.2826152, 2.531905...
4        [-0.9686833, -0.20538887, -2.265118, 1.1455958...
                               ...                        
13226    [-1.1346107, -1.7636762, -1.3233956, 2.0763075...
13227    [-3.0870767, -1.1419549, -0.61456317, 0.974234...
13228    [-1.7394482, 0.7237023, -3.0263448, 1.3411535,...
13229    [-0.5185542, -2.3214946, -1.1179844, 2.8681738...
13230    [-1.6557447, 1.260637, -2.5989487, 0.47640955,...
Name: article, Length: 13231, dtype: object

#### We can generate out dataset specific embeddings also

# Model Building 

In [664]:
def correct_data(x_t):
    x = x_t.to_list()
    x = [i.tolist() for i in x]
    return x

#### y_train, y_test

In [582]:
y_train = df['sentiment_score'][:10000]
y_test = df['sentiment_score'][10000:]

#### tfidf: x_train_tf and x_test_tf

In [583]:
x_train_tf = tf_data[:10000]
x_test_tf = tf_data[10000:]

#### ngrams: x_train_ngram and x_test_ngram

In [584]:
x_train_ngram = ng_data[:10000]
x_test_ngram = ng_data[10000:]

#### CBOW: x_train_cbow and x_test_cbow

In [665]:
x_train_cbow = cbow_data[:10000]
x_train_cbow = correct_data(x_train_cbow)
x_test_cbow = cbow_data[10000:]
x_test_cbow = correct_data(x_test_cbow)

#### skip-gram: x_train_sgram and x_test_sgram

In [666]:
x_train_sgram = sgram_data[:10000]
x_train_sgram = correct_data(x_train_sgram)
x_test_sgram = sgram_data[10000:]
x_test_sgram = correct_data(x_test_sgram)

#### GloVe: x_train_glove and x_test_glove

In [667]:
x_train_glove = glove_data[:10000]
x_train_glove = correct_data(x_train_glove)
x_test_glove = glove_data[10000:]
x_test_glove = correct_data(x_test_glove)

#### Creating dictionary for data

In [668]:
# Create a hierarchical dictionary structure
df = {
    'tfidf': {
        'x_train': x_train_tf,
        'x_test': x_test_tf
    },
    'ngrams': {
        'x_train': x_train_ngram,
        'x_test': x_test_ngram
    },
    'cbow': {
        'x_train': x_train_cbow,
        'x_test': x_test_cbow
    },
    'sgram': {
        'x_train': x_train_sgram,
        'x_test': x_test_sgram
    },
    'glove': {
        'x_train': x_train_glove,
        'x_test': x_test_glove
    },
    'y_train' : y_train,
    'y_test' : y_test
    
}

In [669]:
df.keys()

dict_keys(['tfidf', 'ngrams', 'cbow', 'sgram', 'glove', 'y_train', 'y_test'])

### Models 

    Kernel Ridge Regression

    SVM

    RandomForest
    
    Neural Networks

    RNN & LSTM

#### Hyperparameters Tuning

In [13]:
# for future works

### extracting data

In [1]:
import pickle as pkl
with open('dataset.pkl', 'rb') as pkl_file:
    data = pkl.load(pkl_file)

df = data

### Linear Regression

In [6]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy import sparse

In [4]:
np.random.seed(41)

In [10]:
lr_model = LinearRegression(n_jobs = 6)

def Linear_regression(x_train, x_test, y_train, y_test):    
    lr_model.fit(x_train, y_train*1000)
    y_pred = lr_model.predict(x_test)
    
    return (mean_squared_error(y_test*1000, y_pred), mean_absolute_error(y_test*1000, y_pred), r2_score(y_test*1000, y_pred))

In [15]:
for tr in ['tfidf', 'ngrams', 'cbow', 'sgram', 'glove']:
    print(f"\033[4m\033[1m{tr.upper()}\033[0m\033[0m\n")
    
    x_train = df[tr]['x_train']
    x_test = df[tr]['x_test']
    mse, mae, r2 = Linear_regression(x_train, x_test, df['y_train'], df['y_test'])
    print(f"\t\033[1mMSE : \033[0m{mse}\t\033[1mMAE : \033[0m{mae}\t\033[1mR2_score : \033[0m{r2}\n\n")

[4m[1mTFIDF[0m[0m

	[1mMSE : [0m120851.63854833123	[1mMAE : [0m265.19892056446463	[1mR2_score : [0m0.3046476162086339


[4m[1mNGRAMS[0m[0m

	[1mMSE : [0m2.7031007551269847e+26	[1mMAE : [0m1009365486479.4626	[1mR2_score : [0m-1.5553016709443242e+21


[4m[1mCBOW[0m[0m

	[1mMSE : [0m178105.71432544774	[1mMAE : [0m297.9738722374198	[1mR2_score : [0m-0.024779097004425932


[4m[1mSGRAM[0m[0m

	[1mMSE : [0m156532.86123902083	[1mMAE : [0m295.69269602031295	[1mR2_score : [0m0.09934611138344995


[4m[1mGLOVE[0m[0m

	[1mMSE : [0m131810.61699835837	[1mMAE : [0m280.60002424807436	[1mR2_score : [0m0.2415921882419123




### Ridge Regression
alpha

solver : 'auto', 'svd', 'cholesky', 'saga'

Hyperparameter Tuning

In [69]:
from sklearn.linear_model import Ridge

In [81]:
def ridge_reg(x_train, x_test, y_train, y_test):
    model = Ridge(alpha = 1.0)
    model.fit(x_train, y_train*1000)
    y_pred = model.predict(x_test)
    
    return (mean_squared_error(y_test*1000, y_pred), mean_absolute_error(y_test*1000, y_pred), r2_score(y_test*1000, y_pred))

In [84]:
for tr in ['tfidf', 'ngrams', 'cbow', 'sgram', 'glove']:
    print(f"\033[4m\033[1m{tr.upper()}\033[0m\033[0m\n")
    
    x_train = df[tr]['x_train']
    x_test = df[tr]['x_test']
    mse, mae, r2 = ridge_reg(x_train, x_test, df['y_train'], df['y_test'])
    print(f"\t\033[1mMSE : \033[0m{round(mse, 4)}  \033[1mMAE : \033[0m{round(mae, 4)}  \033[1mR2_score : \033[0m{round(r2, 4)}\n\n")

[4m[1mTFIDF[0m[0m

	[1mMSE : [0m138537.0105  [1mMAE : [0m270.0668  [1mR2_score : [0m0.2029


[4m[1mNGRAMS[0m[0m

	[1mMSE : [0m179464.4354  [1mMAE : [0m299.3846  [1mR2_score : [0m-0.0326


[4m[1mCBOW[0m[0m

	[1mMSE : [0m170270.5029  [1mMAE : [0m300.1176  [1mR2_score : [0m0.0203


[4m[1mSGRAM[0m[0m

	[1mMSE : [0m154526.7931  [1mMAE : [0m295.9903  [1mR2_score : [0m0.1109


[4m[1mGLOVE[0m[0m

	[1mMSE : [0m131304.8267  [1mMAE : [0m280.2617  [1mR2_score : [0m0.2445




### Kernel Ridge Regression
alpha

kernel : 'linear', 'poly'(degree), 'rbf'(gamma), 'sigmoid'

coef0 : (for 'poly' and 'sigmoid') : Independent term in the kernel function. It helps control the trade-off between the linear and nonlinear terms.

In [94]:
from sklearn.kernel_ridge import KernelRidge

In [95]:
def krr(x_train, x_test, y_train, y_test):
    model = KernelRidge(alpha = 1.0, kernel = 'rbf')
    model.fit(x_train, y_train*1000)
    y_pred = model.predict(x_test)
    
    return (mean_squared_error(y_test*1000, y_pred), mean_absolute_error(y_test*1000, y_pred), r2_score(y_test*1000, y_pred))
for tr in ['tfidf', 'ngrams', 'cbow', 'sgram', 'glove']:
    print(f"\033[4m\033[1m{tr.upper()}\033[0m\033[0m\n")
    
    x_train = df[tr]['x_train']
    x_test = df[tr]['x_test']
    mse, mae, r2 = krr(x_train, x_test, df['y_train'], df['y_test'])
    print(f"\t\033[1mMSE : \033[0m{round(mse, 4)}  \033[1mMAE : \033[0m{round(mae, 4)}  \033[1mR2_score : \033[0m{round(r2, 4)}\n\n")

[4m[1mTFIDF[0m[0m

	[1mMSE : [0m167864.5285  [1mMAE : [0m321.9036  [1mR2_score : [0m0.0341


[4m[1mNGRAMS[0m[0m

	[1mMSE : [0m168542.6197  [1mMAE : [0m321.124  [1mR2_score : [0m0.0302


[4m[1mCBOW[0m[0m

	[1mMSE : [0m157454.7504  [1mMAE : [0m300.8135  [1mR2_score : [0m0.094


[4m[1mSGRAM[0m[0m

	[1mMSE : [0m153195.9232  [1mMAE : [0m306.9686  [1mR2_score : [0m0.1185


[4m[1mGLOVE[0m[0m

	[1mMSE : [0m138376.839  [1mMAE : [0m281.1349  [1mR2_score : [0m0.2038




### SVR
C : regularization Parameter(default = 1.0)

epsilon(default = 0.1) : Specifies the epsilon-tube within which no penalty           is associated in the training loss function.

kernel : 'linear', 'poly'(degree), 'rbf'(gamma), 'sigmoid'

In [2]:
from sklearn.svm import SVR

In [6]:
def svr(x_train, x_test, y_train, y_test):
    model = SVR(kernel = 'linear', C=1.0, epsilon=1.0)
    model.fit(x_train, y_train*1000)
    y_pred = model.predict(x_test)
    
    return (mean_squared_error(y_test*1000, y_pred), mean_absolute_error(y_test*1000, y_pred), r2_score(y_test*1000, y_pred))
for tr in ['tfidf', 'ngrams', 'cbow', 'sgram', 'glove']:
    print(f"\033[4m\033[1m{tr.upper()}\033[0m\033[0m\n")
    
    x_train = df[tr]['x_train']
    x_test = df[tr]['x_test']
    mse, mae, r2 = svr(x_train, x_test, df['y_train'], df['y_test'])
    print(f"\t\033[1mMSE : \033[0m{round(mse, 4)}  \033[1mMAE : \033[0m{round(mae, 4)}  \033[1mR2_score : \033[0m{round(r2, 4)}\n\n")

[4m[1mTFIDF[0m[0m

	[1mMSE : [0m171933.9742  [1mMAE : [0m316.9107  [1mR2_score : [0m0.0107


[4m[1mNGRAMS[0m[0m

	[1mMSE : [0m172974.0581  [1mMAE : [0m317.2933  [1mR2_score : [0m0.0047


[4m[1mCBOW[0m[0m

	[1mMSE : [0m164206.4428  [1mMAE : [0m308.9839  [1mR2_score : [0m0.0552


[4m[1mSGRAM[0m[0m

	[1mMSE : [0m161852.8288  [1mMAE : [0m309.8637  [1mR2_score : [0m0.0687


[4m[1mGLOVE[0m[0m

	[1mMSE : [0m144410.0952  [1mMAE : [0m287.5804  [1mR2_score : [0m0.1691




In [7]:
def svr(x_train, x_test, y_train, y_test):
    model = SVR(kernel = 'rbf', C=1.0, epsilon=1.0)
    model.fit(x_train, y_train*1000)
    y_pred = model.predict(x_test)
    
    return (mean_squared_error(y_test*1000, y_pred), mean_absolute_error(y_test*1000, y_pred), r2_score(y_test*1000, y_pred))
for tr in ['tfidf', 'ngrams', 'cbow', 'sgram', 'glove']:
    print(f"\033[4m\033[1m{tr.upper()}\033[0m\033[0m\n")
    
    x_train = df[tr]['x_train']
    x_test = df[tr]['x_test']
    mse, mae, r2 = svr(x_train, x_test, df['y_train'], df['y_test'])
    print(f"\t\033[1mMSE : \033[0m{round(mse, 4)}  \033[1mMAE : \033[0m{round(mae, 4)}  \033[1mR2_score : \033[0m{round(r2, 4)}\n\n")

[4m[1mTFIDF[0m[0m

	[1mMSE : [0m177257.9911  [1mMAE : [0m322.3641  [1mR2_score : [0m-0.0199


[4m[1mNGRAMS[0m[0m

	[1mMSE : [0m176576.094  [1mMAE : [0m321.3702  [1mR2_score : [0m-0.016


[4m[1mCBOW[0m[0m

	[1mMSE : [0m170577.1649  [1mMAE : [0m314.9122  [1mR2_score : [0m0.0185


[4m[1mSGRAM[0m[0m

	[1mMSE : [0m173136.9753  [1mMAE : [0m318.6445  [1mR2_score : [0m0.0038


[4m[1mGLOVE[0m[0m

	[1mMSE : [0m176068.0577  [1mMAE : [0m321.9092  [1mR2_score : [0m-0.0131




### RandomForestRegressor
n_estimators : The number of trees in the forest. More trees can improve performance but increase computation time. : 100

max_depth : The maximum depth of each tree. Limits the growth of trees to control overfitting. : None

min_samples_split : The minimum number of samples required to split an internal node. Higher values prevent overfitting.: 2

min_samples_leaf : The minimum number of samples required to be at a leaf node. Higher values prevent splits that result in nodes with few samples.: 1

max_features : The number of features to consider when looking for the best split. Options include an integer, float (fraction), 'sqrt', or 'log2'.	None

bootstrap : Whether to use bootstrap samples when building trees. If False, the whole dataset is used to build each tree. : True

criterion : The function to measure the quality of a split. Options are 'squared_error' (for regression) or 'absolute_error'.	'squared_error'
random_state	Controls the randomness of the estimator for reproducibility. : None

max_samples : If bootstrap is True, the number of samples to draw from X to train each base estimator : None

In [8]:
from sklearn.ensemble import RandomForestRegressor

In [9]:
def rf(x_train, x_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators = 100, random_state = 42, n_jobs=6, verbose = 1, warm_start=True)
    model.fit(x_train, y_train*1000)
    y_pred = model.predict(x_test)
    
    return (mean_squared_error(y_test*1000, y_pred), mean_absolute_error(y_test*1000, y_pred), r2_score(y_test*1000, y_pred))
for tr in ['tfidf', 'ngrams', 'cbow', 'sgram', 'glove']:
    print(f"\033[4m\033[1m{tr.upper()}\033[0m\033[0m\n")
    
    x_train = df[tr]['x_train']
    x_test = df[tr]['x_test']
    mse, mae, r2 = rf(x_train, x_test, df['y_train'], df['y_test'])
    print(f"\t\033[1mMSE : \033[0m{round(mse, 4)}  \033[1mMAE : \033[0m{round(mae, 4)}  \033[1mR2_score : \033[0m{round(r2, 4)}\n\n")

[4m[1mTFIDF[0m[0m



[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   56.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  2.5min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


	[1mMSE : [0m113502.3309  [1mMAE : [0m243.9789  [1mR2_score : [0m0.3469


[4m[1mNGRAMS[0m[0m



[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  3.6min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


	[1mMSE : [0m136664.9528  [1mMAE : [0m269.7001  [1mR2_score : [0m0.2137


[4m[1mCBOW[0m[0m



[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  3.6min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


	[1mMSE : [0m132316.4438  [1mMAE : [0m274.299  [1mR2_score : [0m0.2387


[4m[1mSGRAM[0m[0m



[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  3.5min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


	[1mMSE : [0m126594.4054  [1mMAE : [0m267.6136  [1mR2_score : [0m0.2716


[4m[1mGLOVE[0m[0m



[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.9min


	[1mMSE : [0m139038.7497  [1mMAE : [0m280.809  [1mR2_score : [0m0.2




[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  4.8min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished


### Neural Networks(regression)

In [110]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tqdm import tqdm

In [10]:
def SimpleNN(x_train, x_test, y_train, y_test):
    def NN(x_train):
        x_train = np.array(x_train)
        model = keras.Sequential([
            layers.Dense(128, activation='relu', input_shape=(x_train.shape[1],)),
            layers.Dense(64, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(1)  # Single output for regression
        ])
        model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mae'])
        return model
    
    x_test = np.array(x_test)
    y_train = y_train*1000
    y_test = y_test*1000
    
    model = NN(x_train)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience = 10,
        restore_best_weights = True)
    epochs = 100
    batch_size = 32
    val_split = 0.2

    history = {
        'loss' : [],
        'val_loss' : []
    }

    for epoch in tqdm(range(epochs), desc = 'Training'):
        history_epoch = model.fit(x_train, y_train, epochs = 1, batch_size = batch_size,
                                 validation_split = val_split, verbose = 0, 
                                  callbacks = [early_stopping])

        history['loss'].append(history_epoch.history['loss'][0])
        history['val_loss'].append(history_epoch.history['val_loss'][0])

        if early_stopping.stopped_epoch > 0:
            print(f"Training Stopped at epoch {epoch + 1}")

    y_pred = model.predict(x_test)
    return (mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred))

for tr in ['tfidf', 'ngrams']:
    print(f"\033[4m\033[1m{tr.upper()}\033[0m\033[0m\n")
    
    x_train = df[tr]['x_train']
    x_test = df[tr]['x_test']
    mse, mae, r2 = SimpleNN(x_train, x_test, df['y_train'], df['y_test'])
    print(f"\t\033[1mMSE : \033[0m{round(mse, 4)}  \033[1mMAE : \033[0m{round(mae, 4)}  \033[1mR2_score : \033[0m{round(r2, 4)}\n\n")

[4m[1mTFIDF[0m[0m



Training: 100%|██████████████████████████████████████████████████████████████████████| 100/100 [01:07<00:00,  1.48it/s]


	[1mMSE : [0m117870.3387  [1mMAE : [0m261.383  [1mR2_score : [0m0.3218


[4m[1mNGRAMS[0m[0m



Training: 100%|██████████████████████████████████████████████████████████████████████| 100/100 [02:41<00:00,  1.62s/it]


	[1mMSE : [0m147798.0689  [1mMAE : [0m285.1289  [1mR2_score : [0m0.1496




### LSTM

In [100]:
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [109]:
def load_glove_embeddings(file_path):
    embeddings_index = {}    
    with open(file_path, 'r', encoding = 'utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype = 'float32')
            embeddings_index[word] = coefs
    return embeddings_index

def create_embedding_matrix(tokenizer, embeddings_index, embedding_dim = 300):
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
    
    for word, i in tqdm(tokenizer.word_index.items(), desc = 'Creating Embeddings'):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def prepare_sequences(tokenizer, texts, max_length):
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length)
    return padded_sequences

def LSTMModel(x_train, x_test, y_train, y_test, embedding_matrix, max_length=100):
    model = models.Sequential([
        layers.Embedding(input_dim=embedding_matrix.shape[0], 
                         output_dim=embedding_matrix.shape[1], 
                         weights=[embedding_matrix], 
                         input_length=max_length,
                         trainable=False),
        layers.LSTM(128, return_sequences=True),
        layers.LSTM(64),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)  # Single output for regression
    ])
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    # Convert to numpy arrays and adjust scale as needed
    y_train = y_train * 1000  
    y_test = y_test * 1000
    
    # Early stopping callback
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )
    
    epochs = 100
    batch_size = 32
    val_split = 0.2

    history = {
        'loss': [],
        'val_loss': []
    }

    for epoch in tqdm(range(epochs), desc='Training'):
        history_epoch = model.fit(x_train, y_train, epochs=1, batch_size=batch_size,
                                  validation_split=val_split, verbose=0,
                                  callbacks=[early_stopping])

        history['loss'].append(history_epoch.history['loss'][0])
        history['val_loss'].append(history_epoch.history['val_loss'][0])
        print("Loss : ", history_epoch.history['loss'])

        if early_stopping.stopped_epoch > 0:
            print(f"Training Stopped at epoch {epoch + 1}")
            break

    y_pred = model.predict(x_test)
    return (mean_squared_error(y_test, y_pred), 
            mean_absolute_error(y_test, y_pred), 
            r2_score(y_test, y_pred))

if __name__ == "__main__":
    glove_file_path = 'D:/DATA (D)/Glove/glove.6B/glove.6B.300d.txt'
    glove_embeddings = load_glove_embeddings(glove_file_path)
    
    dataset = pd.read_csv('df.csv')
    df_train = dataset.iloc[:10000, :]
    df_test = dataset.iloc[10000:, :]
    
    train_texts = df_train['article'].values
    test_texts = df_test['article'].values
    
    y_train = df_train['sentiment_score']
    y_test = df_test['sentiment_score']
    
    tokenize = Tokenizer()
    tokenize.fit_on_texts(train_texts)
    
    embedding_matrix = create_embedding_matrix(tokenize, glove_embeddings, embedding_dim = 300)
    
    max_length = 100
    x_train = prepare_sequences(tokenize, train_texts, max_length)
    x_test = prepare_sequences(tokenize, test_texts, max_length)

    result = LSTMModel(x_train, x_test, y_train, y_test, embedding_matrix, max_length)
    print("MSE : ", result[0], "\tMAE : ", result[1], "\tR2 : ", result[2])

Creating Embeddings: 100%|███████████████████████████████████████████████████| 18444/18444 [00:00<00:00, 271365.33it/s]
Training:   1%|▋                                                                     | 1/100 [00:36<1:00:23, 36.60s/it]

Loss :  [213093.515625]



Training:   2%|█▍                                                                      | 2/100 [01:08<55:10, 33.78s/it]

Loss :  [162857.625]



Training:   3%|██▏                                                                     | 3/100 [01:38<52:00, 32.17s/it]

Loss :  [146414.203125]



Training:   4%|██▉                                                                     | 4/100 [02:08<49:58, 31.24s/it]

Loss :  [145583.703125]



Training:   5%|███▌                                                                    | 5/100 [02:38<48:40, 30.75s/it]

Loss :  [145577.65625]



Training:   6%|████▎                                                                   | 6/100 [03:07<47:22, 30.24s/it]

Loss :  [145576.71875]



Training:   7%|█████                                                                   | 7/100 [03:36<46:26, 29.96s/it]

Loss :  [145581.96875]



Training:   8%|█████▊                                                                  | 8/100 [04:06<45:36, 29.75s/it]

Loss :  [145570.890625]



Training:   9%|██████▍                                                                 | 9/100 [04:35<44:57, 29.64s/it]

Loss :  [145556.15625]



Training:  10%|███████                                                                | 10/100 [05:05<44:24, 29.61s/it]

Loss :  [141175.578125]



Training:  11%|███████▊                                                               | 11/100 [05:34<43:52, 29.58s/it]

Loss :  [131832.734375]



Training:  12%|████████▌                                                              | 12/100 [06:04<43:24, 29.60s/it]

Loss :  [112843.375]



Training:  13%|█████████▏                                                             | 13/100 [06:34<43:01, 29.67s/it]

Loss :  [82693.8203125]



Training:  14%|█████████▉                                                             | 14/100 [07:03<42:21, 29.55s/it]

Loss :  [62366.45703125]



Training:  15%|██████████▋                                                            | 15/100 [07:32<41:42, 29.45s/it]

Loss :  [48067.421875]



Training:  16%|███████████▎                                                           | 16/100 [08:02<41:17, 29.50s/it]

Loss :  [37235.00390625]



Training:  17%|████████████                                                           | 17/100 [08:31<40:45, 29.47s/it]

Loss :  [28574.69921875]



Training:  18%|████████████▊                                                          | 18/100 [09:02<40:46, 29.83s/it]

Loss :  [21833.685546875]



Training:  19%|█████████████▍                                                         | 19/100 [09:32<40:19, 29.87s/it]

Loss :  [16978.017578125]



Training:  20%|██████████████▏                                                        | 20/100 [10:02<39:55, 29.94s/it]

Loss :  [13067.1669921875]



Training:  21%|██████████████▉                                                        | 21/100 [10:32<39:20, 29.88s/it]

Loss :  [10234.2890625]



Training:  22%|███████████████▌                                                       | 22/100 [11:02<38:57, 29.97s/it]

Loss :  [8349.462890625]



Training:  23%|████████████████▎                                                      | 23/100 [11:32<38:30, 30.01s/it]

Loss :  [7651.80908203125]



Training:  24%|█████████████████                                                      | 24/100 [12:02<37:58, 29.98s/it]

Loss :  [8100.45068359375]



Training:  25%|█████████████████▊                                                     | 25/100 [12:32<37:39, 30.13s/it]

Loss :  [5358.736328125]



Training:  26%|██████████████████▍                                                    | 26/100 [13:03<37:11, 30.16s/it]

Loss :  [4969.2890625]



Training:  27%|███████████████████▏                                                   | 27/100 [13:33<36:50, 30.29s/it]

Loss :  [4392.05810546875]



Training:  28%|███████████████████▉                                                   | 28/100 [14:04<36:22, 30.31s/it]

Loss :  [4003.49609375]



Training:  29%|████████████████████▌                                                  | 29/100 [14:34<35:51, 30.30s/it]

Loss :  [6936.39013671875]



Training:  30%|█████████████████████▎                                                 | 30/100 [15:05<35:32, 30.47s/it]

Loss :  [4285.1357421875]



Training:  31%|██████████████████████                                                 | 31/100 [15:35<35:02, 30.47s/it]

Loss :  [3982.175537109375]



Training:  32%|██████████████████████▋                                                | 32/100 [16:06<34:32, 30.47s/it]

Loss :  [3564.2822265625]



Training:  33%|███████████████████████▍                                               | 33/100 [16:36<33:56, 30.40s/it]

Loss :  [3168.72412109375]



Training:  34%|████████████████████████▏                                              | 34/100 [17:06<33:19, 30.29s/it]

Loss :  [3021.114501953125]



Training:  35%|████████████████████████▊                                              | 35/100 [17:37<32:58, 30.44s/it]

Loss :  [2845.5126953125]



Training:  36%|█████████████████████████▌                                             | 36/100 [18:07<32:22, 30.35s/it]

Loss :  [2459.060302734375]



Training:  37%|██████████████████████████▎                                            | 37/100 [18:37<31:50, 30.33s/it]

Loss :  [2395.95458984375]



Training:  38%|██████████████████████████▉                                            | 38/100 [19:07<31:21, 30.35s/it]

Loss :  [2086.802978515625]



Training:  39%|███████████████████████████▋                                           | 39/100 [19:38<30:48, 30.30s/it]

Loss :  [7519.81103515625]



Training:  40%|████████████████████████████▍                                          | 40/100 [20:08<30:20, 30.34s/it]

Loss :  [4778.8388671875]



Training:  41%|█████████████████████████████                                          | 41/100 [20:39<29:51, 30.37s/it]

Loss :  [3137.606201171875]



Training:  42%|█████████████████████████████▊                                         | 42/100 [21:09<29:20, 30.35s/it]

Loss :  [2009.9271240234375]



Training:  43%|██████████████████████████████▌                                        | 43/100 [21:40<28:55, 30.44s/it]

Loss :  [1677.490478515625]



Training:  44%|███████████████████████████████▏                                       | 44/100 [22:10<28:24, 30.44s/it]

Loss :  [1537.4849853515625]



Training:  45%|███████████████████████████████▉                                       | 45/100 [22:40<27:50, 30.37s/it]

Loss :  [1414.49658203125]



Training:  46%|████████████████████████████████▋                                      | 46/100 [23:20<29:49, 33.15s/it]

Loss :  [1421.138427734375]



Training:  47%|█████████████████████████████████▎                                     | 47/100 [24:01<31:18, 35.44s/it]

Loss :  [1379.1131591796875]



Training:  48%|██████████████████████████████████                                     | 48/100 [24:41<32:03, 36.99s/it]

Loss :  [1610.4937744140625]



Training:  49%|██████████████████████████████████▊                                    | 49/100 [25:21<32:06, 37.78s/it]

Loss :  [1493.5728759765625]



Training:  50%|███████████████████████████████████▌                                   | 50/100 [26:02<32:13, 38.68s/it]

Loss :  [1233.078857421875]



Training:  51%|████████████████████████████████████▏                                  | 51/100 [26:42<31:55, 39.10s/it]

Loss :  [1041.845947265625]



Training:  52%|████████████████████████████████████▉                                  | 52/100 [27:22<31:37, 39.54s/it]

Loss :  [1343.937255859375]



Training:  53%|█████████████████████████████████████▋                                 | 53/100 [28:03<31:10, 39.80s/it]

Loss :  [2562.07861328125]



Training:  54%|██████████████████████████████████████▎                                | 54/100 [28:43<30:43, 40.07s/it]

Loss :  [1868.9857177734375]



Training:  55%|███████████████████████████████████████                                | 55/100 [29:25<30:21, 40.48s/it]

Loss :  [1305.904541015625]



Training:  56%|███████████████████████████████████████▊                               | 56/100 [30:05<29:43, 40.53s/it]

Loss :  [1101.8558349609375]



Training:  57%|████████████████████████████████████████▍                              | 57/100 [30:45<28:55, 40.37s/it]

Loss :  [1206.8489990234375]



Training:  58%|█████████████████████████████████████████▏                             | 58/100 [31:26<28:13, 40.32s/it]

Loss :  [931.2379150390625]



Training:  59%|█████████████████████████████████████████▉                             | 59/100 [32:06<27:36, 40.39s/it]

Loss :  [784.2937622070312]



Training:  60%|██████████████████████████████████████████▌                            | 60/100 [32:47<26:54, 40.37s/it]

Loss :  [768.3660888671875]



Training:  61%|███████████████████████████████████████████▎                           | 61/100 [33:27<26:13, 40.35s/it]

Loss :  [874.7066650390625]



Training:  62%|████████████████████████████████████████████                           | 62/100 [34:09<25:59, 41.04s/it]

Loss :  [874.6575927734375]



Training:  63%|████████████████████████████████████████████▋                          | 63/100 [34:54<25:58, 42.13s/it]

Loss :  [995.1165161132812]



Training:  64%|█████████████████████████████████████████████▍                         | 64/100 [35:35<25:07, 41.86s/it]

Loss :  [893.3494262695312]



Training:  65%|██████████████████████████████████████████████▏                        | 65/100 [36:16<24:10, 41.44s/it]

Loss :  [806.4595947265625]



Training:  66%|██████████████████████████████████████████████▊                        | 66/100 [36:58<23:31, 41.52s/it]

Loss :  [892.0398559570312]



Training:  67%|████████████████████████████████████████████▏                     | 67/100 [1:25:10<8:13:18, 896.92s/it]

Loss :  [907.6822509765625]



Training:  68%|████████████████████████████████████████████▉                     | 68/100 [1:26:06<5:43:43, 644.47s/it]

Loss :  [765.9224243164062]



Training:  69%|█████████████████████████████████████████████▌                    | 69/100 [1:26:52<4:00:17, 465.09s/it]

Loss :  [708.476806640625]



Training:  70%|██████████████████████████████████████████████▏                   | 70/100 [1:27:37<2:49:32, 339.08s/it]

Loss :  [771.5733032226562]



Training:  71%|██████████████████████████████████████████████▊                   | 71/100 [1:28:20<2:00:50, 250.00s/it]

Loss :  [819.3905029296875]



Training:  72%|███████████████████████████████████████████████▌                  | 72/100 [1:29:06<1:28:08, 188.86s/it]

Loss :  [740.4340209960938]



Training:  73%|████████████████████████████████████████████████▏                 | 73/100 [1:29:52<1:05:40, 145.94s/it]

Loss :  [717.5126953125]



Training:  74%|██████████████████████████████████████████████████▎                 | 74/100 [1:30:37<50:07, 115.68s/it]

Loss :  [656.5509643554688]



Training:  75%|███████████████████████████████████████████████████▊                 | 75/100 [1:31:24<39:41, 95.28s/it]

Loss :  [653.8307495117188]



Training:  76%|████████████████████████████████████████████████████▍                | 76/100 [1:32:10<32:06, 80.26s/it]

Loss :  [641.0252075195312]



Training:  77%|█████████████████████████████████████████████████████▏               | 77/100 [1:32:53<26:31, 69.18s/it]

Loss :  [765.6498413085938]



Training:  78%|█████████████████████████████████████████████████████▊               | 78/100 [1:33:36<22:27, 61.24s/it]

Loss :  [1129.3564453125]



Training:  79%|██████████████████████████████████████████████████████▌              | 79/100 [1:34:18<19:27, 55.61s/it]

Loss :  [1171.5440673828125]



Training:  80%|███████████████████████████████████████████████████████▏             | 80/100 [1:35:07<17:50, 53.54s/it]

Loss :  [760.5636596679688]



Training:  81%|███████████████████████████████████████████████████████▉             | 81/100 [1:35:59<16:49, 53.12s/it]

Loss :  [422.992431640625]



Training:  82%|████████████████████████████████████████████████████████▌            | 82/100 [1:36:50<15:46, 52.58s/it]

Loss :  [281.2176513671875]



Training:  83%|█████████████████████████████████████████████████████████▎           | 83/100 [1:37:40<14:40, 51.77s/it]

Loss :  [215.54794311523438]



Training:  84%|█████████████████████████████████████████████████████████▉           | 84/100 [1:38:29<13:34, 50.92s/it]

Loss :  [220.50643920898438]



Training:  85%|██████████████████████████████████████████████████████████▋          | 85/100 [1:39:19<12:38, 50.56s/it]

Loss :  [268.76837158203125]



Training:  86%|███████████████████████████████████████████████████████████▎         | 86/100 [1:40:07<11:36, 49.74s/it]

Loss :  [398.7582702636719]



Training:  87%|████████████████████████████████████████████████████████████         | 87/100 [1:40:50<10:21, 47.80s/it]

Loss :  [662.351806640625]



Training:  88%|████████████████████████████████████████████████████████████▋        | 88/100 [1:41:36<09:27, 47.32s/it]

Loss :  [1341.8157958984375]



Training:  89%|█████████████████████████████████████████████████████████████▍       | 89/100 [1:42:22<08:35, 46.84s/it]

Loss :  [1449.861572265625]



Training:  90%|██████████████████████████████████████████████████████████████       | 90/100 [1:43:08<07:45, 46.54s/it]

Loss :  [769.54443359375]



Training:  91%|██████████████████████████████████████████████████████████████▊      | 91/100 [1:43:53<06:55, 46.17s/it]

Loss :  [453.4833679199219]



Training:  92%|███████████████████████████████████████████████████████████████▍     | 92/100 [1:44:38<06:05, 45.74s/it]

Loss :  [267.1376037597656]



Training:  93%|████████████████████████████████████████████████████████████████▏    | 93/100 [1:45:23<05:18, 45.55s/it]

Loss :  [206.20172119140625]



Training:  94%|████████████████████████████████████████████████████████████████▊    | 94/100 [1:45:57<04:13, 42.17s/it]

Loss :  [192.84323120117188]



Training:  95%|█████████████████████████████████████████████████████████████████▌   | 95/100 [1:46:30<03:16, 39.29s/it]

Loss :  [233.01678466796875]



Training:  96%|██████████████████████████████████████████████████████████████████▏  | 96/100 [1:47:03<02:30, 37.50s/it]

Loss :  [311.5322570800781]



Training:  97%|██████████████████████████████████████████████████████████████████▉  | 97/100 [1:47:37<01:49, 36.46s/it]

Loss :  [470.0483093261719]



Training:  98%|███████████████████████████████████████████████████████████████████▌ | 98/100 [1:48:21<01:17, 38.77s/it]

Loss :  [961.5517578125]



Training:  99%|████████████████████████████████████████████████████████████████████▎| 99/100 [1:48:58<00:38, 38.17s/it]

Loss :  [1597.838623046875]


Training: 100%|████████████████████████████████████████████████████████████████████| 100/100 [1:49:34<00:00, 65.75s/it]

Loss :  [1298.4388427734375]





MSE :  90750.53364386059 	MAE :  227.2926801374872 	R2 :  0.5149566292298862


In [98]:
dataset = pd.read_csv('df.csv')

#### With best weights

In [112]:
def load_glove_embeddings(file_path):
    embeddings_index = {}    
    with open(file_path, 'r', encoding = 'utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype = 'float32')
            embeddings_index[word] = coefs
    return embeddings_index

def create_embedding_matrix(tokenizer, embeddings_index, embedding_dim = 300):
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
    
    for word, i in tqdm(tokenizer.word_index.items(), desc = 'Creating Embeddings'):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def prepare_sequences(tokenizer, texts, max_length):
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length)
    return padded_sequences

def LSTMModel(x_train, x_test, y_train, y_test, embedding_matrix, max_length=100):
    model = models.Sequential([
        layers.Embedding(input_dim=embedding_matrix.shape[0], 
                         output_dim=embedding_matrix.shape[1], 
                         weights=[embedding_matrix], 
                         input_length=max_length,
                         trainable=False),
        layers.LSTM(128, return_sequences=True),
        layers.LSTM(64),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)  # Single output for regression
    ])
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    
    print(model.summary())
    
    # Convert to numpy arrays and adjust scale as needed
    y_train = y_train * 1000  
    y_test = y_test * 1000
    
    # Early stopping callback
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )
    
    model_checkpoint = ModelCheckpoint('best_model_weights.h5',
                                      save_best_only=True,
                                      save_weights_only=True,
                                      monitor = 'val_loss',
                                      mode = 'min',
                                      verbose = 1)
    
    epochs = 100
    batch_size = 32
    val_split = 0.2

    history = {
        'loss': [],
        'val_loss': []
    }

    for epoch in tqdm(range(epochs), desc='Training'):
        history_epoch = model.fit(x_train, y_train, epochs=1, batch_size=batch_size,
                                  validation_split=val_split, verbose=0,
                                  callbacks=[early_stopping, model_checkpoint])

        history['loss'].append(history_epoch.history['loss'][0])
        history['val_loss'].append(history_epoch.history['val_loss'][0])
        print("Loss : ", history_epoch.history['loss'])

        if early_stopping.stopped_epoch > 0:
            print(f"Training Stopped at epoch {epoch + 1}")
            break
    
    model.load_weights('best_model_weights.h5')

    y_pred = model.predict(x_test)
    return (mean_squared_error(y_test, y_pred), 
            mean_absolute_error(y_test, y_pred), 
            r2_score(y_test, y_pred))

if __name__ == "__main__":
    glove_file_path = 'D:/DATA (D)/Glove/glove.6B/glove.6B.300d.txt'
    glove_embeddings = load_glove_embeddings(glove_file_path)
    
    dataset = pd.read_csv('df.csv')
    df_train = dataset.iloc[:10000, :]
    df_test = dataset.iloc[10000:, :]
    
    train_texts = df_train['article'].values
    test_texts = df_test['article'].values
    
    y_train = df_train['sentiment_score']
    y_test = df_test['sentiment_score']
    
    tokenize = Tokenizer()
    tokenize.fit_on_texts(train_texts)
    
    embedding_matrix = create_embedding_matrix(tokenize, glove_embeddings, embedding_dim = 300)
    
    max_length = 100
    x_train = prepare_sequences(tokenize, train_texts, max_length)
    x_test = prepare_sequences(tokenize, test_texts, max_length)

    result = LSTMModel(x_train, x_test, y_train, y_test, embedding_matrix, max_length)
    print("MSE : ", result[0], "\tMAE : ", result[1], "\tR2 : ", result[2])

Creating Embeddings: 100%|███████████████████████████████████████████████████| 18444/18444 [00:00<00:00, 396287.83it/s]


Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 300)          5533500   
                                                                 
 lstm_6 (LSTM)               (None, 100, 128)          219648    
                                                                 
 lstm_7 (LSTM)               (None, 64)                49408     
                                                                 
 dense_30 (Dense)            (None, 32)                2080      
                                                                 
 dense_31 (Dense)            (None, 1)                 33        
                                                                 
Total params: 5,804,669
Trainable params: 271,169
Non-trainable params: 5,533,500
_________________________________________________________________
None



Training:   0%|                                                                                | 0/100 [00:00<?, ?it/s]


Epoch 1: val_loss improved from inf to 164810.85938, saving model to best_model_weights.h5



Training:   1%|▋                                                                    | 1/100 [02:46<4:34:40, 166.47s/it]

Loss :  [204525.09375]

Epoch 1: val_loss improved from 164810.85938 to 149475.59375, saving model to best_model_weights.h5



Training:   2%|█▍                                                                   | 2/100 [05:03<4:03:41, 149.20s/it]

Loss :  [153423.921875]

Epoch 1: val_loss did not improve from 149475.59375



Training:   3%|██                                                                   | 3/100 [07:22<3:53:43, 144.57s/it]

Loss :  [145590.765625]

Epoch 1: val_loss did not improve from 149475.59375



Training:   4%|██▊                                                                  | 4/100 [09:46<3:51:01, 144.39s/it]

Loss :  [145571.109375]

Epoch 1: val_loss did not improve from 149475.59375



Training:   5%|███▍                                                                 | 5/100 [11:59<3:42:12, 140.34s/it]

Loss :  [145567.953125]

Epoch 1: val_loss did not improve from 149475.59375



Training:   6%|████▏                                                                | 6/100 [14:10<3:34:38, 137.00s/it]

Loss :  [145591.859375]

Epoch 1: val_loss did not improve from 149475.59375



Training:   7%|████▊                                                                | 7/100 [16:23<3:30:30, 135.82s/it]

Loss :  [145587.21875]

Epoch 1: val_loss did not improve from 149475.59375



Training:   8%|█████▌                                                               | 8/100 [18:40<3:28:35, 136.04s/it]

Loss :  [145543.109375]

Epoch 1: val_loss improved from 149475.59375 to 148317.43750, saving model to best_model_weights.h5



Training:   9%|██████▏                                                              | 9/100 [21:00<3:28:08, 137.24s/it]

Loss :  [141156.734375]

Epoch 1: val_loss improved from 148317.43750 to 139302.40625, saving model to best_model_weights.h5



Training:  10%|██████▊                                                             | 10/100 [23:22<3:28:23, 138.93s/it]

Loss :  [128703.7734375]

Epoch 1: val_loss improved from 139302.40625 to 115556.02344, saving model to best_model_weights.h5



Training:  11%|███████▍                                                            | 11/100 [25:46<3:28:09, 140.34s/it]

Loss :  [101537.8125]

Epoch 1: val_loss improved from 115556.02344 to 108797.00781, saving model to best_model_weights.h5



Training:  12%|████████▏                                                           | 12/100 [28:08<3:26:48, 141.01s/it]

Loss :  [76345.2421875]

Epoch 1: val_loss improved from 108797.00781 to 101378.10938, saving model to best_model_weights.h5



Training:  13%|████████▊                                                           | 13/100 [30:26<3:22:55, 139.95s/it]

Loss :  [60433.73046875]

Epoch 1: val_loss improved from 101378.10938 to 99800.17969, saving model to best_model_weights.h5



Training:  14%|█████████▌                                                          | 14/100 [32:40<3:18:02, 138.17s/it]

Loss :  [48627.8125]

Epoch 1: val_loss improved from 99800.17969 to 88330.88281, saving model to best_model_weights.h5



Training:  15%|██████████▏                                                         | 15/100 [34:55<3:14:14, 137.11s/it]

Loss :  [37443.02734375]

Epoch 1: val_loss improved from 88330.88281 to 86367.57031, saving model to best_model_weights.h5



Training:  16%|██████████▉                                                         | 16/100 [37:08<3:10:30, 136.07s/it]

Loss :  [29705.8671875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  17%|███████████▌                                                        | 17/100 [39:23<3:07:25, 135.49s/it]

Loss :  [23896.697265625]

Epoch 1: val_loss did not improve from 86367.57031



Training:  18%|████████████▏                                                       | 18/100 [41:41<3:06:20, 136.34s/it]

Loss :  [19404.34765625]

Epoch 1: val_loss did not improve from 86367.57031



Training:  19%|████████████▉                                                       | 19/100 [44:10<3:09:25, 140.32s/it]

Loss :  [15870.2958984375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  20%|█████████████▌                                                      | 20/100 [46:32<3:07:47, 140.84s/it]

Loss :  [14184.71484375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  21%|██████████████▎                                                     | 21/100 [48:49<3:03:41, 139.51s/it]

Loss :  [10663.84375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  22%|██████████████▉                                                     | 22/100 [51:12<3:02:35, 140.46s/it]

Loss :  [8857.693359375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  23%|███████████████▋                                                    | 23/100 [53:29<2:59:01, 139.50s/it]

Loss :  [7288.611328125]

Epoch 1: val_loss did not improve from 86367.57031



Training:  24%|████████████████▎                                                   | 24/100 [55:49<2:56:58, 139.71s/it]

Loss :  [6193.869140625]

Epoch 1: val_loss did not improve from 86367.57031



Training:  25%|█████████████████                                                   | 25/100 [58:08<2:54:27, 139.56s/it]

Loss :  [5166.2900390625]

Epoch 1: val_loss did not improve from 86367.57031



Training:  26%|█████████████████▏                                                | 26/100 [1:00:28<2:52:10, 139.61s/it]

Loss :  [4446.55029296875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  27%|█████████████████▊                                                | 27/100 [1:02:51<2:51:07, 140.65s/it]

Loss :  [3804.9951171875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  28%|██████████████████▍                                               | 28/100 [1:05:12<2:48:47, 140.65s/it]

Loss :  [3527.920654296875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  29%|███████████████████▏                                              | 29/100 [1:07:29<2:45:24, 139.78s/it]

Loss :  [3401.916748046875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  30%|███████████████████▊                                              | 30/100 [1:09:46<2:41:52, 138.74s/it]

Loss :  [3131.637939453125]

Epoch 1: val_loss did not improve from 86367.57031



Training:  31%|████████████████████▍                                             | 31/100 [1:12:03<2:39:01, 138.28s/it]

Loss :  [2693.71826171875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  32%|█████████████████████                                             | 32/100 [1:14:27<2:38:32, 139.89s/it]

Loss :  [2594.233154296875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  33%|█████████████████████▊                                            | 33/100 [1:16:49<2:37:03, 140.65s/it]

Loss :  [2438.21875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  34%|██████████████████████▍                                           | 34/100 [1:19:09<2:34:20, 140.30s/it]

Loss :  [2067.539306640625]

Epoch 1: val_loss did not improve from 86367.57031



Training:  35%|███████████████████████                                           | 35/100 [1:21:28<2:31:50, 140.17s/it]

Loss :  [2055.045654296875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  36%|███████████████████████▊                                          | 36/100 [1:23:48<2:29:18, 139.98s/it]

Loss :  [2075.06787109375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  37%|████████████████████████▍                                         | 37/100 [1:26:02<2:25:07, 138.22s/it]

Loss :  [1987.4271240234375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  38%|█████████████████████████                                         | 38/100 [1:28:18<2:22:16, 137.69s/it]

Loss :  [1782.232177734375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  39%|█████████████████████████▋                                        | 39/100 [1:30:35<2:19:34, 137.29s/it]

Loss :  [1559.313720703125]

Epoch 1: val_loss did not improve from 86367.57031



Training:  40%|██████████████████████████▍                                       | 40/100 [1:32:51<2:16:58, 136.98s/it]

Loss :  [1512.7606201171875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  41%|███████████████████████████                                       | 41/100 [1:35:08<2:14:43, 137.01s/it]

Loss :  [1507.88232421875]

Epoch 1: val_loss did not improve from 86367.57031



Training:  42%|███████████████████████████▋                                      | 42/100 [1:37:26<2:12:32, 137.11s/it]

Loss :  [1473.4906005859375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  43%|████████████████████████████▍                                     | 43/100 [1:39:43<2:10:25, 137.29s/it]

Loss :  [1349.80712890625]

Epoch 1: val_loss did not improve from 86367.57031



Training:  44%|█████████████████████████████                                     | 44/100 [1:42:00<2:07:58, 137.12s/it]

Loss :  [1331.435302734375]

Epoch 1: val_loss did not improve from 86367.57031



Training:  45%|█████████████████████████████▋                                    | 45/100 [1:44:18<2:05:57, 137.42s/it]

Loss :  [1204.3304443359375]

Epoch 1: val_loss improved from 86367.57031 to 86227.64844, saving model to best_model_weights.h5



Training:  46%|██████████████████████████████▎                                   | 46/100 [1:46:36<2:03:48, 137.56s/it]

Loss :  [1255.2337646484375]

Epoch 1: val_loss did not improve from 86227.64844



Training:  47%|███████████████████████████████                                   | 47/100 [1:48:54<2:01:39, 137.72s/it]

Loss :  [1198.542724609375]

Epoch 1: val_loss improved from 86227.64844 to 85572.21094, saving model to best_model_weights.h5



Training:  48%|███████████████████████████████▋                                  | 48/100 [1:51:12<1:59:18, 137.65s/it]

Loss :  [1318.28564453125]

Epoch 1: val_loss did not improve from 85572.21094



Training:  49%|████████████████████████████████▎                                 | 49/100 [1:53:29<1:56:58, 137.61s/it]

Loss :  [1395.0479736328125]

Epoch 1: val_loss did not improve from 85572.21094



Training:  50%|█████████████████████████████████                                 | 50/100 [1:55:47<1:54:39, 137.59s/it]

Loss :  [1529.51416015625]

Epoch 1: val_loss did not improve from 85572.21094



Training:  51%|█████████████████████████████████▋                                | 51/100 [1:58:03<1:52:10, 137.36s/it]

Loss :  [1268.326904296875]

Epoch 1: val_loss improved from 85572.21094 to 85444.23438, saving model to best_model_weights.h5



Training:  52%|██████████████████████████████████▎                               | 52/100 [2:00:21<1:49:58, 137.46s/it]

Loss :  [1010.0357055664062]

Epoch 1: val_loss improved from 85444.23438 to 83660.32812, saving model to best_model_weights.h5



Training:  53%|██████████████████████████████████▉                               | 53/100 [2:02:38<1:47:35, 137.35s/it]

Loss :  [961.568115234375]

Epoch 1: val_loss did not improve from 83660.32812



Training:  54%|███████████████████████████████████▋                              | 54/100 [2:04:56<1:45:21, 137.42s/it]

Loss :  [936.3602294921875]

Epoch 1: val_loss did not improve from 83660.32812



Training:  55%|████████████████████████████████████▎                             | 55/100 [2:07:14<1:43:12, 137.60s/it]

Loss :  [858.1740112304688]

Epoch 1: val_loss did not improve from 83660.32812



Training:  56%|████████████████████████████████████▉                             | 56/100 [2:09:47<1:44:13, 142.13s/it]

Loss :  [929.5484008789062]

Epoch 1: val_loss did not improve from 83660.32812



Training:  57%|█████████████████████████████████████▌                            | 57/100 [2:12:14<1:42:55, 143.63s/it]

Loss :  [1573.233642578125]

Epoch 1: val_loss did not improve from 83660.32812



Training:  58%|██████████████████████████████████████▎                           | 58/100 [2:14:33<1:39:35, 142.28s/it]

Loss :  [1431.0518798828125]

Epoch 1: val_loss did not improve from 83660.32812



Training:  59%|██████████████████████████████████████▉                           | 59/100 [2:16:50<1:36:09, 140.71s/it]

Loss :  [1001.7313232421875]

Epoch 1: val_loss did not improve from 83660.32812



Training:  60%|███████████████████████████████████████▌                          | 60/100 [2:19:04<1:32:28, 138.70s/it]

Loss :  [712.3018798828125]

Epoch 1: val_loss did not improve from 83660.32812



Training:  61%|████████████████████████████████████████▎                         | 61/100 [2:21:22<1:30:07, 138.66s/it]

Loss :  [551.84228515625]

Epoch 1: val_loss did not improve from 83660.32812



Training:  62%|████████████████████████████████████████▉                         | 62/100 [2:23:39<1:27:30, 138.17s/it]

Loss :  [438.04388427734375]

Epoch 1: val_loss did not improve from 83660.32812



Training:  63%|█████████████████████████████████████████▌                        | 63/100 [2:25:54<1:24:26, 136.94s/it]

Loss :  [674.9714965820312]

Epoch 1: val_loss did not improve from 83660.32812



Training:  64%|██████████████████████████████████████████▏                       | 64/100 [2:28:08<1:21:42, 136.19s/it]

Loss :  [847.5696411132812]

Epoch 1: val_loss did not improve from 83660.32812



Training:  65%|██████████████████████████████████████████▉                       | 65/100 [2:30:22<1:18:58, 135.40s/it]

Loss :  [884.320556640625]

Epoch 1: val_loss did not improve from 83660.32812



Training:  66%|███████████████████████████████████████████▌                      | 66/100 [2:32:33<1:16:03, 134.23s/it]

Loss :  [987.8463134765625]

Epoch 1: val_loss did not improve from 83660.32812



Training:  67%|████████████████████████████████████████████▏                     | 67/100 [2:34:43<1:13:07, 132.95s/it]

Loss :  [1004.131103515625]

Epoch 1: val_loss did not improve from 83660.32812



Training:  68%|████████████████████████████████████████████▉                     | 68/100 [2:36:54<1:10:38, 132.44s/it]

Loss :  [896.67138671875]

Epoch 1: val_loss improved from 83660.32812 to 83594.35156, saving model to best_model_weights.h5



Training:  69%|█████████████████████████████████████████████▌                    | 69/100 [2:39:07<1:08:30, 132.58s/it]

Loss :  [782.5051879882812]

Epoch 1: val_loss improved from 83594.35156 to 83135.61719, saving model to best_model_weights.h5



Training:  70%|██████████████████████████████████████████████▏                   | 70/100 [2:41:18<1:06:03, 132.13s/it]

Loss :  [691.01904296875]

Epoch 1: val_loss improved from 83135.61719 to 82980.90625, saving model to best_model_weights.h5



Training:  71%|██████████████████████████████████████████████▊                   | 71/100 [2:43:29<1:03:41, 131.78s/it]

Loss :  [585.2443237304688]

Epoch 1: val_loss did not improve from 82980.90625



Training:  72%|███████████████████████████████████████████████▌                  | 72/100 [2:45:42<1:01:35, 131.96s/it]

Loss :  [507.5086364746094]

Epoch 1: val_loss improved from 82980.90625 to 82730.18750, saving model to best_model_weights.h5



Training:  73%|█████████████████████████████████████████████████▋                  | 73/100 [2:47:52<59:13, 131.59s/it]

Loss :  [420.79290771484375]

Epoch 1: val_loss did not improve from 82730.18750



Training:  74%|██████████████████████████████████████████████████▎                 | 74/100 [2:50:07<57:22, 132.39s/it]

Loss :  [432.22930908203125]

Epoch 1: val_loss did not improve from 82730.18750



Training:  75%|███████████████████████████████████████████████████                 | 75/100 [2:52:25<55:55, 134.22s/it]

Loss :  [540.0759887695312]

Epoch 1: val_loss did not improve from 82730.18750



Training:  76%|███████████████████████████████████████████████████▋                | 76/100 [2:54:43<54:04, 135.21s/it]

Loss :  [1009.6923828125]

Epoch 1: val_loss did not improve from 82730.18750



Training:  77%|████████████████████████████████████████████████████▎               | 77/100 [2:56:55<51:28, 134.27s/it]

Loss :  [1442.4083251953125]

Epoch 1: val_loss did not improve from 82730.18750



Training:  78%|█████████████████████████████████████████████████████               | 78/100 [2:59:14<49:44, 135.67s/it]

Loss :  [1577.0013427734375]

Epoch 1: val_loss improved from 82730.18750 to 82662.65625, saving model to best_model_weights.h5



Training:  79%|█████████████████████████████████████████████████████▋              | 79/100 [3:01:28<47:18, 135.18s/it]

Loss :  [918.1226196289062]

Epoch 1: val_loss improved from 82662.65625 to 82517.15625, saving model to best_model_weights.h5



Training:  80%|██████████████████████████████████████████████████████▍             | 80/100 [3:03:44<45:11, 135.60s/it]

Loss :  [490.9365234375]

Epoch 1: val_loss improved from 82517.15625 to 82142.89062, saving model to best_model_weights.h5



Training:  81%|███████████████████████████████████████████████████████             | 81/100 [3:05:57<42:42, 134.85s/it]

Loss :  [288.3812255859375]

Epoch 1: val_loss did not improve from 82142.89062



Training:  82%|███████████████████████████████████████████████████████▊            | 82/100 [3:08:16<40:49, 136.11s/it]

Loss :  [193.2667999267578]

Epoch 1: val_loss did not improve from 82142.89062



Training:  83%|████████████████████████████████████████████████████████▍           | 83/100 [3:10:37<38:54, 137.35s/it]

Loss :  [214.56634521484375]

Epoch 1: val_loss did not improve from 82142.89062



Training:  84%|█████████████████████████████████████████████████████████           | 84/100 [3:12:56<36:48, 138.01s/it]

Loss :  [234.5863800048828]

Epoch 1: val_loss did not improve from 82142.89062



Training:  85%|█████████████████████████████████████████████████████████▊          | 85/100 [3:15:10<34:13, 136.91s/it]

Loss :  [304.48101806640625]

Epoch 1: val_loss improved from 82142.89062 to 82094.69531, saving model to best_model_weights.h5



Training:  86%|██████████████████████████████████████████████████████████▍         | 86/100 [3:17:25<31:45, 136.08s/it]

Loss :  [419.77056884765625]

Epoch 1: val_loss did not improve from 82094.69531



Training:  87%|███████████████████████████████████████████████████████████▏        | 87/100 [3:19:35<29:07, 134.41s/it]

Loss :  [782.6936645507812]

Epoch 1: val_loss did not improve from 82094.69531



Training:  88%|███████████████████████████████████████████████████████████▊        | 88/100 [3:21:48<26:47, 133.92s/it]

Loss :  [1215.8460693359375]

Epoch 1: val_loss did not improve from 82094.69531



Training:  89%|████████████████████████████████████████████████████████████▌       | 89/100 [3:24:00<24:26, 133.34s/it]

Loss :  [1128.640380859375]

Epoch 1: val_loss did not improve from 82094.69531



Training:  90%|█████████████████████████████████████████████████████████████▏      | 90/100 [3:26:15<22:18, 133.86s/it]

Loss :  [678.03955078125]

Epoch 1: val_loss did not improve from 82094.69531



Training:  91%|█████████████████████████████████████████████████████████████▉      | 91/100 [3:28:28<20:01, 133.54s/it]

Loss :  [653.5504760742188]

Epoch 1: val_loss improved from 82094.69531 to 81284.28906, saving model to best_model_weights.h5



Training:  92%|██████████████████████████████████████████████████████████████▌     | 92/100 [3:30:40<17:45, 133.17s/it]

Loss :  [825.0529174804688]

Epoch 1: val_loss did not improve from 81284.28906



Training:  93%|███████████████████████████████████████████████████████████████▏    | 93/100 [3:32:54<15:33, 133.37s/it]

Loss :  [533.0183715820312]

Epoch 1: val_loss did not improve from 81284.28906



Training:  94%|███████████████████████████████████████████████████████████████▉    | 94/100 [3:35:05<13:15, 132.67s/it]

Loss :  [274.89886474609375]

Epoch 1: val_loss did not improve from 81284.28906



Training:  95%|████████████████████████████████████████████████████████████████▌   | 95/100 [3:37:16<11:01, 132.23s/it]

Loss :  [183.92591857910156]

Epoch 1: val_loss did not improve from 81284.28906



Training:  96%|█████████████████████████████████████████████████████████████████▎  | 96/100 [3:39:32<08:53, 133.25s/it]

Loss :  [151.6776885986328]

Epoch 1: val_loss did not improve from 81284.28906



Training:  97%|█████████████████████████████████████████████████████████████████▉  | 97/100 [3:41:51<06:44, 134.96s/it]

Loss :  [143.62461853027344]

Epoch 1: val_loss did not improve from 81284.28906



Training:  98%|██████████████████████████████████████████████████████████████████▋ | 98/100 [3:44:08<04:31, 135.51s/it]

Loss :  [184.63235473632812]

Epoch 1: val_loss did not improve from 81284.28906



Training:  99%|███████████████████████████████████████████████████████████████████▎| 99/100 [3:46:18<02:13, 133.93s/it]

Loss :  [310.6312561035156]

Epoch 1: val_loss did not improve from 81284.28906



Training: 100%|███████████████████████████████████████████████████████████████████| 100/100 [3:48:25<00:00, 132.05s/it]

Loss :  [1012.8612060546875]


Training: 100%|███████████████████████████████████████████████████████████████████| 100/100 [3:48:26<00:00, 137.06s/it]


MSE :  89491.47586404636 	MAE :  226.46459103309564 	R2 :  0.5216860401214192


In [113]:
import pickle as pkl

In [None]:
with open('lstm.pkl', 'wb') as file:
    file.dump(model)