# Install tpot only if not there

In [3]:
pip install tpot

Collecting tpot
  Downloading TPOT-0.12.0-py3-none-any.whl (87 kB)
     ---------------------------------------- 87.4/87.4 kB 2.5 MB/s eta 0:00:00
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting deap>=1.2
  Downloading deap-1.3.3-cp39-cp39-win_amd64.whl (114 kB)
     ---------------------------------------- 114.3/114.3 kB ? eta 0:00:00
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting xgboost>=1.1.0
  Downloading xgboost-1.7.5-py3-none-win_amd64.whl (70.9 MB)
     ---------------------------------------- 70.9/70.9 MB 7.0 MB/s eta 0:00:00
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py): started
  Building wheel for stopit (setup.py): finished with status 'done'
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11939 sha256=b2b7491679265e7a8cdb3

## Ignoring warnings

In [36]:
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

# Import packages

In [37]:
from tpot import TPOTClassifier
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import get_scorer_names, get_scorer
from datetime import datetime, timedelta
import nltk
import numpy as np

from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler

# Read comapny tickertape data tsv file created after webscraping

In [38]:
data = pd.read_csv('companyList.tsv', delimiter='\t',encoding='utf-8')

In [39]:
data.head()

Unnamed: 0,sNo,Company Name,Full Company Name,sc Performance,sc Valuation,sc Growth,sc Profitability,sc Entry Point,sc Red Flags,Income Statements,...,2020 PBT,2021 Total Revenue,2021 EBITDA,2021 Net Income,2021 PBT,2022 Total Revenue,2022 EBITDA,2022 Net Income,2022 PBT,PE Ratio
0,1,ABFRL,Aditya Birla Fashion and Retail Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is not in the overbought zone,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,-32.89,5322.32,655.29,-672.51,-837.54,8239.11,1241.33,-108.72,-144.93,-522.45
1,2,AFFLE,Affle (India) Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,High Strong financials and growth story over ...,High Showing good signs of profitability & ef...,BadThe stock is overpriced and in the overboug...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,79.23,558.0,171.2,134.81,147.92,1153.34,284.33,213.9,244.81,55.23
2,3,AARTIDRUGS,Aarti Drugs Ltd,"Avg Price return has been average, nothing ex...",High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Good The stock is underpriced and is not in t...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,185.3,2162.57,445.12,280.41,369.01,2499.97,343.92,205.04,269.96,26.2
3,4,AARTIIND,Aarti Industries Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,676.18,4506.8,982.23,523.47,664.55,7000.76,1929.61,1307.19,1526.8,33.66
4,5,ACC,ACC Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Lower than Industry Revenue GrowthA higher-tha...,...,1708.85,16370.03,3161.68,1862.99,2506.38,17656.07,1597.86,649.44,871.25,70.59


# Creating Sentiment based on score card and writing it to a file

In [40]:
# Step 1: Load and preprocess the data
data = pd.read_csv('companyList.tsv', delimiter='\t',encoding='utf-8')  # Assuming you have a CSV file with sentiment data


# Create a new column for sentiment labels
data['Entry_point_Sentiment'] = ''
data['Growth_Sentiment']= ''
data['Profitability_Sentiment']= ''

# Function to assign sentiment labels based on keywords
def assign_sentiment_label(text):
    lowercase_text = text.lower()
    if 'high' in lowercase_text:
        return 2
    elif 'avg' in lowercase_text:
        return 1
    else:
        return 0

def redflag_valuation_sentiment_label(text):
    lowercase_text = text.lower()
    if 'high' in lowercase_text:
        return 0
    elif 'avg' in lowercase_text:
        return 1
    else:
        return 2

def entrypoint_sentiment_label(text):
    lowercase_text = text.lower()
    if 'bad' in lowercase_text:
        return 0
    elif 'good' in lowercase_text:
        return 2
    else:
        return 1
data['Entry_point_Sentiment'] = data['sc Entry Point'].apply(entrypoint_sentiment_label)
data['Growth_Sentiment'] = data['sc Growth'].apply(assign_sentiment_label)
data['Profitability_Sentiment'] = data['sc Profitability'].apply(assign_sentiment_label)
data['RedFlag_Sentiment'] = data['sc Red Flags'].apply(redflag_valuation_sentiment_label)
data['Performance_Sentiment'] = data['sc Performance'].apply(assign_sentiment_label)
data['Valuation_Sentiment'] = data['sc Valuation'].apply(redflag_valuation_sentiment_label)


# Save the updated dataset
data.to_csv('companyList_sentiment.csv', index=False)

In [41]:
data.head()

Unnamed: 0,sNo,Company Name,Full Company Name,sc Performance,sc Valuation,sc Growth,sc Profitability,sc Entry Point,sc Red Flags,Income Statements,...,2022 EBITDA,2022 Net Income,2022 PBT,PE Ratio,Entry_point_Sentiment,Growth_Sentiment,Profitability_Sentiment,RedFlag_Sentiment,Performance_Sentiment,Valuation_Sentiment
0,1,ABFRL,Aditya Birla Fashion and Retail Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is not in the overbought zone,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,1241.33,-108.72,-144.93,-522.45,1,0,2,2,0,0
1,2,AFFLE,Affle (India) Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,High Strong financials and growth story over ...,High Showing good signs of profitability & ef...,BadThe stock is overpriced and in the overboug...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,284.33,213.9,244.81,55.23,0,2,2,2,0,0
2,3,AARTIDRUGS,Aarti Drugs Ltd,"Avg Price return has been average, nothing ex...",High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Good The stock is underpriced and is not in t...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,343.92,205.04,269.96,26.2,2,0,2,2,1,0
3,4,AARTIIND,Aarti Industries Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,1929.61,1307.19,1526.8,33.66,1,0,2,2,0,0
4,5,ACC,ACC Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Lower than Industry Revenue GrowthA higher-tha...,...,1597.86,649.44,871.25,70.59,1,0,2,2,0,0


# Creating Avg Sentiment based on all scorecards

In [42]:
data['Sentiment']=(data['Entry_point_Sentiment']+data['Growth_Sentiment']+data['Profitability_Sentiment']+data['Performance_Sentiment']+data['RedFlag_Sentiment']+data['Valuation_Sentiment'])/6
print(data['Sentiment'])
data.to_csv('companyList_avg_sentiment.csv', index=False)

0      0.833333
1      1.000000
2      1.166667
3      0.833333
4      0.833333
         ...   
492    1.000000
493    0.833333
494    0.833333
495    0.833333
496    0.333333
Name: Sentiment, Length: 497, dtype: float64


# Removing special characters & cleaning data

In [43]:
data = data.replace("NAN", 0)
data = data.fillna(0)

In [44]:
data = pd.read_csv('companyList_avg_sentiment.csv')
data.head()
def remove_special_characters(text):
    # Remove special characters and punctuation using regular expressions
    cleaned_text = re.sub(r'[^\d\s.-]', '', text)
    return cleaned_text

data['2019 Total Revenue']= data['2019 Total Revenue'].apply(remove_special_characters)
data['2019 EBITDA']= data['2019 EBITDA'].apply(remove_special_characters)
data['2019 Net Income']= data['2019 Net Income'].apply(remove_special_characters)
data['2019 PBT']= data['2019 PBT'].apply(remove_special_characters)
data['2020 Total Revenue']= data['2020 Total Revenue'].apply(remove_special_characters)
data['2020 EBITDA']= data['2020 EBITDA'].apply(remove_special_characters)
data['2020 Net Income']= data['2020 Net Income'].apply(remove_special_characters)
data['2020 PBT']= data['2020 PBT'].apply(remove_special_characters)
data['2021 Total Revenue']= data['2021 Total Revenue'].apply(remove_special_characters)
data['2021 EBITDA']= data['2021 EBITDA'].apply(remove_special_characters)
data['2021 Net Income']= data['2021 Net Income'].apply(remove_special_characters)
data['2021 PBT']= data['2021 PBT'].apply(remove_special_characters)
data['2022 Total Revenue']= data['2022 Total Revenue'].apply(remove_special_characters)
data['2022 EBITDA']= data['2022 EBITDA'].apply(remove_special_characters)
data['2022 Net Income']= data['2022 Net Income'].apply(remove_special_characters)
data['2022 PBT']= data['2022 PBT'].apply(remove_special_characters)
data['PE Ratio']= data['PE Ratio'].apply(remove_special_characters)




data.to_csv('companyList_avg_sentiment_clean.csv', index=False)

In [45]:
newsdata = pd.read_csv('companyNewsList.tsv', delimiter='\t',encoding='utf-8')

In [46]:
newsdata.head()

Unnamed: 0,sNo,Company Name,Full Company Name,News Title,News Article,News Summary,NewsDate
0,4,AARTIIND,Aarti Industries Ltd,Trade setup for Wednesday: Top 15 things to kn...,The recovery in late trade amid consolidation ...,"This was followed by 18,800 strike comprising ...",5 days ago
1,5,ACC,ACC Ltd,"Who Is Nitin Agarwal, New DG Of BSF And Kerala...",After lying vacant for five months post the re...,After lying vacant for five months post the re...,12 hours ago
2,5,ACC,ACC Ltd,India-Pakistan World Cup match in Ahmedabad a ...,The 'hybrid' Asia Cup model is likely to be ap...,"""Oman Cricket board chief Pankaj Khimji, one o...",1 day ago
3,5,ACC,ACC Ltd,ACC Likely To Go Ahead With PCB’s Hybrid Model...,Asia Cup trophy. | (Credits: Twitter)\n\nThe A...,Asia Cup trophy.\n| (Credits: Twitter)The Asia...,1 day ago
4,6,ADANIGREEN,Adani Green Energy Ltd,NSE revises price bands on three Adani Group c...,National Stock Exchange (NSE) has revised pric...,National Stock Exchange (NSE) has revised pric...,5 days ago


In [47]:
def convert_relative_time(relative_time):
    pattern = r'(\d+)\s+(\w+)\s+ago'
    match = re.search(pattern, relative_time)
    if match:
        count = int(match.group(1))
        unit = match.group(2)
        if unit == 'days':
            return timedelta(days=count)
        elif unit == 'hours':
            return timedelta(hours=count)
        elif unit == 'minutes':
            return timedelta(minutes=count)
    return timedelta()

In [48]:
newsdata['NewsDate']= datetime.now() - newsdata['NewsDate'].apply(convert_relative_time)

In [49]:

sia = SentimentIntensityAnalyzer()

#filtered_news = newsdata[newsdata['NewsDate'] > pd.Timestamp.now() - pd.Timedelta(days=2)]
filtered_news = newsdata[newsdata['NewsDate'] > pd.Timestamp.now() - pd.Timedelta]
filtered_news['News Sentiment Score'] = filtered_news['News Article'].astype(str).apply(lambda x: sia.polarity_scores(x)['compound'])
avg_sentiment = filtered_news.groupby('Company Name')['News Sentiment Score'].mean()

In [50]:
avg_sentiment.to_frame()

Unnamed: 0_level_0,News Sentiment Score
Company Name,Unnamed: 1_level_1
ABB,0.990000
ACC,0.960767
ADANIENT,0.996200
ADANIPORTS,0.989200
ADANIPOWER,0.564267
...,...
VEDL,0.929067
VGUARD,0.964500
WIPRO,0.985233
YESBANK,0.664400


In [51]:
data.head()

Unnamed: 0,sNo,Company Name,Full Company Name,sc Performance,sc Valuation,sc Growth,sc Profitability,sc Entry Point,sc Red Flags,Income Statements,...,2022 Net Income,2022 PBT,PE Ratio,Entry_point_Sentiment,Growth_Sentiment,Profitability_Sentiment,RedFlag_Sentiment,Performance_Sentiment,Valuation_Sentiment,Sentiment
0,1,ABFRL,Aditya Birla Fashion and Retail Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is not in the overbought zone,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,-108.72,-144.93,-522.45,1,0,2,2,0,0,0.833333
1,2,AFFLE,Affle (India) Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,High Strong financials and growth story over ...,High Showing good signs of profitability & ef...,BadThe stock is overpriced and in the overboug...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,213.9,244.81,55.23,0,2,2,2,0,0,1.0
2,3,AARTIDRUGS,Aarti Drugs Ltd,"Avg Price return has been average, nothing ex...",High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Good The stock is underpriced and is not in t...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,205.04,269.96,26.2,2,0,2,2,1,0,1.166667
3,4,AARTIIND,Aarti Industries Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,1307.19,1526.8,33.66,1,0,2,2,0,0,0.833333
4,5,ACC,ACC Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Lower than Industry Revenue GrowthA higher-tha...,...,649.44,871.25,70.59,1,0,2,2,0,0,0.833333


In [52]:
newsdata.head()

Unnamed: 0,sNo,Company Name,Full Company Name,News Title,News Article,News Summary,NewsDate
0,4,AARTIIND,Aarti Industries Ltd,Trade setup for Wednesday: Top 15 things to kn...,The recovery in late trade amid consolidation ...,"This was followed by 18,800 strike comprising ...",2023-06-07 21:26:35.012637
1,5,ACC,ACC Ltd,"Who Is Nitin Agarwal, New DG Of BSF And Kerala...",After lying vacant for five months post the re...,After lying vacant for five months post the re...,2023-06-12 09:26:35.012637
2,5,ACC,ACC Ltd,India-Pakistan World Cup match in Ahmedabad a ...,The 'hybrid' Asia Cup model is likely to be ap...,"""Oman Cricket board chief Pankaj Khimji, one o...",2023-06-12 21:26:35.012637
3,5,ACC,ACC Ltd,ACC Likely To Go Ahead With PCB’s Hybrid Model...,Asia Cup trophy. | (Credits: Twitter)\n\nThe A...,Asia Cup trophy.\n| (Credits: Twitter)The Asia...,2023-06-12 21:26:35.012637
4,6,ADANIGREEN,Adani Green Energy Ltd,NSE revises price bands on three Adani Group c...,National Stock Exchange (NSE) has revised pric...,National Stock Exchange (NSE) has revised pric...,2023-06-07 21:26:35.012637


In [53]:
combined_data = data.merge(avg_sentiment, on='Company Name', how='outer')

In [54]:
combined_data.head()

Unnamed: 0,sNo,Company Name,Full Company Name,sc Performance,sc Valuation,sc Growth,sc Profitability,sc Entry Point,sc Red Flags,Income Statements,...,2022 PBT,PE Ratio,Entry_point_Sentiment,Growth_Sentiment,Profitability_Sentiment,RedFlag_Sentiment,Performance_Sentiment,Valuation_Sentiment,Sentiment,News Sentiment Score
0,1.0,ABFRL,Aditya Birla Fashion and Retail Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is not in the overbought zone,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,-144.93,-522.45,1.0,0.0,2.0,2.0,0.0,0.0,0.833333,
1,2.0,AFFLE,Affle (India) Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,High Strong financials and growth story over ...,High Showing good signs of profitability & ef...,BadThe stock is overpriced and in the overboug...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,244.81,55.23,0.0,2.0,2.0,2.0,0.0,0.0,1.0,
2,3.0,AARTIDRUGS,Aarti Drugs Ltd,"Avg Price return has been average, nothing ex...",High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Good The stock is underpriced and is not in t...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,269.96,26.2,2.0,0.0,2.0,2.0,1.0,0.0,1.166667,
3,4.0,AARTIIND,Aarti Industries Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,1526.8,33.66,1.0,0.0,2.0,2.0,0.0,0.0,0.833333,
4,5.0,ACC,ACC Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Lower than Industry Revenue GrowthA higher-tha...,...,871.25,70.59,1.0,0.0,2.0,2.0,0.0,0.0,0.833333,0.960767


In [55]:
combined_data = combined_data.fillna(0)
combined_data = combined_data.replace(r'^\s*$', 0, regex=True)

In [56]:
combined_data.head()

Unnamed: 0,sNo,Company Name,Full Company Name,sc Performance,sc Valuation,sc Growth,sc Profitability,sc Entry Point,sc Red Flags,Income Statements,...,2022 PBT,PE Ratio,Entry_point_Sentiment,Growth_Sentiment,Profitability_Sentiment,RedFlag_Sentiment,Performance_Sentiment,Valuation_Sentiment,Sentiment,News Sentiment Score
0,1.0,ABFRL,Aditya Birla Fashion and Retail Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is not in the overbought zone,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,-144.93,-522.45,1.0,0.0,2.0,2.0,0.0,0.0,0.833333,0.0
1,2.0,AFFLE,Affle (India) Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,High Strong financials and growth story over ...,High Showing good signs of profitability & ef...,BadThe stock is overpriced and in the overboug...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,244.81,55.23,0.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0
2,3.0,AARTIDRUGS,Aarti Drugs Ltd,"Avg Price return has been average, nothing ex...",High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Good The stock is underpriced and is not in t...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,269.96,26.2,2.0,0.0,2.0,2.0,1.0,0.0,1.166667,0.0
3,4.0,AARTIIND,Aarti Industries Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Higher than Industry Revenue GrowthA higher-th...,...,1526.8,33.66,1.0,0.0,2.0,2.0,0.0,0.0,0.833333,0.0
4,5.0,ACC,ACC Ltd,Low Hasn't faired well - amongst the low perf...,High Seems to be overvalued vs the market ave...,Low Lagging behind the market in financials g...,High Showing good signs of profitability & ef...,Avg The stock is overpriced but is not in the...,Low No red flag found,Lower than Industry Revenue GrowthA higher-tha...,...,871.25,70.59,1.0,0.0,2.0,2.0,0.0,0.0,0.833333,0.960767


In [57]:


X = combined_data[['News Sentiment Score','2019 Total Revenue','2019 EBITDA','2019 Net Income','2019 PBT','2020 Total Revenue','2020 EBITDA','2020 Net Income','2020 PBT','2021 Total Revenue','2021 EBITDA','2021 Net Income','2021 PBT','2022 Total Revenue','2022 EBITDA','2022 Net Income','2022 PBT','PE Ratio']]

# Get the values of X
X_values = X.values
print(X)
X.to_csv('X_values.csv', index=False)


     News Sentiment Score 2019 Total Revenue 2019 EBITDA 2019 Net Income  \
0                0.000000            8182.50      656.10          321.22   
1                0.000000             249.79       70.71           48.82   
2                0.000000            1567.12      217.09           89.75   
3                0.000000            4169.66      967.19          491.73   
4                0.960767           15990.00     2745.23         1377.41   
..                    ...                ...         ...             ...   
493              0.000000             402.83      219.61          153.73   
494              0.000000                  0           0               0   
495              0.000000             164.76       68.02            8.97   
496              0.000000                  0           0               0   
497              0.858800                  0           0               0   

    2019 PBT 2020 Total Revenue 2020 EBITDA 2020 Net Income 2020 PBT  \
0     149.10   

In [58]:
combined_data.to_csv('companyList_xvalues.csv', index=False)

In [59]:

# Create an instance of MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 2))
# Fit the scaler to the data and transform the values
combined_data['News Sentiment Score'] = scaler.fit_transform(combined_data[['News Sentiment Score']])
combined_data['2019 Total Revenue'] = scaler.fit_transform(combined_data[['2019 Total Revenue']])
combined_data['2020 Total Revenue'] = scaler.fit_transform(combined_data[['2020 Total Revenue']])
combined_data['2021 Total Revenue'] = scaler.fit_transform(combined_data[['2021 Total Revenue']])
combined_data['2022 Total Revenue'] = scaler.fit_transform(combined_data[['2022 Total Revenue']])
combined_data['2019 EBITDA'] = scaler.fit_transform(combined_data[['2019 EBITDA']])
combined_data['2020 EBITDA'] = scaler.fit_transform(combined_data[['2020 EBITDA']])
combined_data['2021 EBITDA'] = scaler.fit_transform(combined_data[['2021 EBITDA']])
combined_data['2022 EBITDA'] = scaler.fit_transform(combined_data[['2022 EBITDA']])
combined_data['2019 Net Income'] = scaler.fit_transform(combined_data[['2019 Net Income']])
combined_data['2020 Net Income'] = scaler.fit_transform(combined_data[['2020 Net Income']])
combined_data['2021 Net Income'] = scaler.fit_transform(combined_data[['2021 Net Income']])
combined_data['2022 Net Income'] = scaler.fit_transform(combined_data[['2022 Net Income']])
combined_data['2019 PBT'] = scaler.fit_transform(combined_data[['2019 PBT']])
combined_data['2020 PBT'] = scaler.fit_transform(combined_data[['2020 PBT']])
combined_data['2021 PBT'] = scaler.fit_transform(combined_data[['2021 PBT']])
combined_data['2022 PBT'] = scaler.fit_transform(combined_data[['2022 PBT']])
combined_data['PE Ratio'] = scaler.fit_transform(combined_data[['PE Ratio']])


In [60]:
combined_data.to_csv('companyList_avg_sentiment_clean_news.csv', index=False)

In [61]:
combined_data = combined_data.fillna(1)

In [63]:
combined_data['Avg Sentiment']=((combined_data['Sentiment']*0.3)+(combined_data['News Sentiment Score']*0.7))

# Define the bins and labels for classification
bins = [0,0.5,0.95, 1.2,1.6, 2]
labels = [0, 1, 2]

# Classify sentiment values into categories
combined_data['Final_Sentiment'] = pd.cut(combined_data['Avg Sentiment'], bins=bins, labels=labels)


combined_data.to_csv('companyList_final_500.csv', index=False,sep=',')


# Using Tpot to create a model

In [32]:
data = pd.read_csv('companyList_final_train.csv')

# Separate the features (X) and target variable (y)
X = data[['News Sentiment Score','Entry_point_Sentiment','Growth_Sentiment','Profitability_Sentiment','RedFlag_Sentiment','Performance_Sentiment','Valuation_Sentiment','2019 Total Revenue','2019 EBITDA','2019 Net Income','2019 PBT','2020 Total Revenue','2020 EBITDA','2020 Net Income','2020 PBT','2021 Total Revenue','2021 EBITDA','2021 Net Income','2021 PBT','2022 Total Revenue','2022 EBITDA','2022 Net Income','2022 PBT','PE Ratio']]
y = data['Final_Sentiment']

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape
X_test.shape

# Step 3: Define and run TPOT
tpot = TPOTClassifier(generations=3, population_size=50, verbosity=2, max_time_mins = 5)
tpot.fit(X_train, y_train)

# Step 4: Evaluate TPOT performance
scorer_names = get_scorer_names()
accuracy = tpot.score(X_test, y_test)
print("Accuracy:", accuracy)

# Step 5: Export the optimized pipeline as Python code

tpot.export('tpot_sentiment_pipeline.py')

Optimization Progress:   0%|          | 0/50 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9375

Generation 2 - Current best internal CV score: 0.9375

Generation 3 - Current best internal CV score: 0.9375

Best pipeline: MLPClassifier(input_matrix, alpha=0.1, learning_rate_init=0.01)
Accuracy: 0.9


## copying the tpot_sentiment_pipeline code and using joblib to dump exported_pipeline to a pkl file

In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('companyList_final_train.csv', sep=',')
columns_to_drop = ['sNo', 'Company Name','Full Company Name','sc Performance','sc Valuation','sc Growth','sc Profitability','sc Entry Point','sc Red Flags','Income Statements','Sentiment','Avg Sentiment']
tpot_data = tpot_data.drop(columns_to_drop, axis=1)

features = tpot_data.drop('Final_Sentiment', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(tpot_data, tpot_data['Final_Sentiment'], random_state=None)

# Average CV score on the training set was: 0.7625
exported_pipeline = MLPClassifier(alpha=0.1, learning_rate_init=0.01)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

import joblib
joblib.dump (exported_pipeline,'Sentiment_Analysis_Flag_Data.pkl')

['Sentiment_Analysis_Flag_Data.pkl']

In [34]:
print(training_features.T)

                               31        90        63        98        67  \
2019 Total Revenue       0.011282  0.016076  0.017442  0.012500  0.070571   
2019 EBITDA              0.334319  0.292353  0.283790  0.287279  0.366958   
2019 Net Income          0.931812  0.851082  0.833958  0.852045  0.910771   
2019 PBT                 0.797735  0.733800  0.721980  0.735831  0.810544   
2020 Total Revenue       0.010767  0.014357  0.015128  0.012319  0.067398   
2020 EBITDA              0.423658  0.386854  0.380102  0.383174  0.456853   
2020 Net Income          1.357744  1.311157  1.305920  1.309842  1.355816   
2020 PBT                 1.128405  1.077832  1.075126  1.077958  1.139929   
2021 Total Revenue       0.008559  0.002842  0.009565  0.007068  0.063587   
2021 EBITDA              0.152403  0.081110  0.091086  0.097488  0.187877   
2021 Net Income          1.025758  0.932144  0.945219  0.952533  1.017106   
2021 PBT                 0.963134  0.870564  0.888471  0.893744  0.973518   

## test the original data in dataframe to test model

In [64]:
testdf = pd.read_csv('companyList_final_500.csv', sep=',')
testdf.head()
columns_to_drop = ['sNo', 'Company Name','Full Company Name','sc Performance','sc Valuation','sc Growth','sc Profitability','sc Entry Point','sc Red Flags','Income Statements','Sentiment','Avg Sentiment']
newtestdf = testdf.drop(columns_to_drop, axis=1)
newtestdf.to_csv('Sentiment_Test.csv', index=False)

In [65]:
model = joblib.load(open('Sentiment_Analysis_Flag_Data.pkl', 'rb'))
predictions = pd.DataFrame(model.predict(newtestdf), columns = ['prediction'])
final = pd.concat([predictions, newtestdf], axis = 1)
print(final)
final.to_csv('Sentiment_Test_Prediction.csv', index=False)

     prediction  2019 Total Revenue  2019 EBITDA  2019 Net Income  2019 PBT  \
0             0            0.028328     0.287447         0.852087  0.727966   
1             1            0.000865     0.276550         0.844124  0.725903   
2             1            0.005425     0.279275         0.845321  0.727548   
3             0            0.014435     0.293237         0.857072  0.738887   
4             2            0.055358     0.326335         0.882964  0.771926   
..          ...                 ...          ...              ...       ...   
493           0            0.001395     0.279321         0.847191  0.729498   
494           0            0.000000     0.275234         0.842697  0.724522   
495           0            0.000570     0.276500         0.842959  0.724843   
496           0            0.000000     0.275234         0.842697  0.724522   
497           2            0.000000     0.275234         0.842697  0.724522   

     2020 Total Revenue  2020 EBITDA  2020 Net Inco