In [81]:
import pandas as pd
import numpy as np
import random
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

In [82]:
import warnings
warnings.filterwarnings('ignore')

In [83]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
df=pd.read_csv('F:\Stock-Market Sentiment project\stock_data.csv')
df.sample(10)

Unnamed: 0,Text,Sentiment
2223,SB WFC Broke trend line Ho hum CA,1
2510,agree user: hedge funds sold AAP in Q4. We'll ...,1
189,GDOT nice bounce from its 50SMA. ooks ready to...,1
3938,PO upside volume tailed off a little and some ...,1
126,Man I didnt know about Skippy when I made this...,1
5327,The U.S.â€™s national medical stockpile has se...,1
2419,BAC ook evel2 lot of selling occur from funds !,-1
5544,British Airways In Talks With Union To Suspend...,-1
4296,AFFY back to a full position size with avg cos...,1
729,Why can't they take AAP out of the index? It's...,-1


In [85]:
df.shape

(5791, 2)

In [86]:
df.isnull().sum()

Text         0
Sentiment    0
dtype: int64

# Data Cleaning

In [87]:
df['Sentiment'].replace(-1,0,inplace=True)

In [88]:
df['Sentiment'].unique()

array([1, 0], dtype=int64)

#### Cleaning text of text columns

In [89]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    q = q.replace('[math]', '')
    
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    q = BeautifulSoup(q)
    q = q.get_text()
    
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q
    

In [90]:
df['Text'] = df['Text'].apply(preprocess)


#### No of sentence present in a cell

In [91]:

senL = []
for i in df['Text']:
    sentences = sent_tokenize(i)
    
    senL.append(len(sentences))

df['Number_of_sentences_cell'] = senL



"\nsenL = []\nfor i in df['Text']:\n    sentences = sent_tokenize(i)\n    \n    senL.append(len(sentences))\n\ndf['Number_of_sentences_cell'] = senL\n\n"

#### removing puntuation

In [92]:

for i in df['Text']:
    translator = str.maketrans('', '', string.punctuation)
    clean_sentence = i.translate(translator)
    df=df.replace(i,clean_sentence)
   

"\nfor i in df['Text']:\n    translator = str.maketrans('', '', string.punctuation)\n    clean_sentence = i.translate(translator)\n    df=df.replace(i,clean_sentence)\n"

#### How many numerical values are containing by each sentence

In [93]:

num_count=[]
for i in df['Text']:

    number_pattern = r'\d+'

    # Search for the pattern in the sentence
    matches = re.findall(number_pattern, i)

    numerical_count = len(matches)
    num_count.append(numerical_count)

df['count_of_numerical_val']=num_count


"\nnum_count=[]\nfor i in df['Text']:\n\n    number_pattern = r'\\d+'\n\n    # Search for the pattern in the sentence\n    matches = re.findall(number_pattern, i)\n\n    numerical_count = len(matches)\n    num_count.append(numerical_count)\n\ndf['count_of_numerical_val']=num_count\n"

#### Number of words present in sentence column

In [94]:
df["no_of_words_in_sentences"]=df['Text'].apply(lambda row: len(row.split(" ")))

#### Count of stopwords of a sentence in a cell

In [95]:

count_stopWords=[]
for i in  df['Text']:
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(i)
    stopword_count = sum(1 for word in words if word.lower() in stop_words)
    count_stopWords.append(stopword_count)

df['count_stopWords']=count_stopWords


"\ncount_stopWords=[]\nfor i in  df['Text']:\n    stop_words = set(stopwords.words('english'))\n    words = nltk.word_tokenize(i)\n    stopword_count = sum(1 for word in words if word.lower() in stop_words)\n    count_stopWords.append(stopword_count)\n\ndf['count_stopWords']=count_stopWords\n"

#### length of sentence of Information column

In [96]:
df["length_of_sentence"]=df["Text"].str.len()


#### Taking only verb, adverb, noun and adjective of the sentece

In [97]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

sentence=[]


for i in df['Text']:
    
    words = word_tokenize(i)

    
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]

    
    tagged_words = pos_tag(filtered_words)

    
    verbs = [word for word, tag in tagged_words if tag.startswith('VB')]
    adverbs = [word for word, tag in tagged_words if tag.startswith('RB')]
    nouns = [word for word, tag in tagged_words if tag.startswith('NN')]
    adjectives = [word for word, tag in tagged_words if tag.startswith('JJ')]
    
    tmp_li = ""  
    for j in i.split():
        if j in verbs or j in adverbs or j in nouns or j in adjectives:
            tmp_li = tmp_li + " " + str(j)
        else:
            pass
    sentence.append(tmp_li.strip())
    

xxx={}    
xxx['Sentences']=sentence


yyy=pd.DataFrame(xxx)
df = pd.merge(df, yyy, left_index=True, right_index=True)

df.head()



Unnamed: 0,Text,Sentiment,Sentences
0,kickers on my watchlist xide tit soq pnk cpw b...,1,kickers watchlist tit soq pnk cpw bpz aj trade...
1,user aap movie 55 percent return for the fea...,1,user aap movie percent return fea geed indicat...
2,user i would be afraid to short amzn they ar...,1,user afraid short amzn looking monopoly ebooks...
3,mnta over 12 00,1,mnta
4,oi over 21 37,1,oi


In [98]:
df.drop('Text',axis=1, inplace=True)

In [99]:
sentence

['kickers watchlist tit soq pnk cpw bpz aj trade method method see prev posts',
 'user aap movie percent return fea geed indicator trades year awesome',
 'user afraid short amzn looking monopoly ebooks infrastructure service',
 'mnta',
 'oi',
 'pgnx',
 'aap user current downtrend break otherwise short term correction med term downtrend',
 'monday relative weakness nyx win tie tap ice int bmc aon c chk biib',
 'goog ower trend line channel test volume support',
 'aap watch tomorrow entry',
 'assuming fcx opens tomorrow trigger buy still much setup',
 'really worries everyone expects market rally usually exact opposite happens time see soon bac spx jpm',
 'aap gamco arry haverty apple extremely cheap great video',
 'user maykiljil posted agree msft going higher possibly north',
 'momentum coming back etfc broke ma200 resistance solid volume friday set',
 'ha hitting means resume targeting level',
 'user gameplan shot today liked trend break c h break oc weekly trend break back july',
 'f

#### Total word count

In [100]:
df["Word_count"]=df['Sentences'].apply(lambda row: len(row.split(" ")))


In [101]:
df.head()

Unnamed: 0,Sentiment,Sentences
0,1,kickers watchlist tit soq pnk cpw bpz aj trade...
1,1,user aap movie percent return fea geed indicat...
2,1,user afraid short amzn looking monopoly ebooks...
3,1,mnta
4,1,oi


In [102]:
df.shape

(5791, 2)

In [104]:
df.isnull().sum()

Sentiment    0
Sentences    0
dtype: int64

In [103]:
df.to_excel('myData.xlsx')


# Data Preprocessing & Model Training

In [72]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Sentences'])
count_vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())



In [74]:
count_vectorized_df.head()

Unnamed: 0,00,10th,2013,50cents,60ish,7xu6leolls,a0nevebvqw,a0x6ij3pps,a15,a1vkeup3kz,...,œessentialâ,œhysteresisâ,œi,œpandemic,œpriority,œrecalibratingâ,œsince,œthen,œwe,œyou
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
df.head()

Unnamed: 0,Sentiment,Sentences
0,1,kickers watchlist tit soq pnk cpw bpz aj trade...
1,1,user aap movie percent return fea geed indicat...
2,1,user afraid short amzn looking monopoly ebooks...
3,1,mnta
4,1,oi


In [75]:
df = pd.merge(df, count_vectorized_df, left_index=True, right_index=True)
df.drop('Sentences',axis=1, inplace=True)
df

Unnamed: 0,Sentiment,00,10th,2013,50cents,60ish,7xu6leolls,a0nevebvqw,a0x6ij3pps,a15,...,œessentialâ,œhysteresisâ,œi,œpandemic,œpriority,œrecalibratingâ,œsince,œthen,œwe,œyou
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5788,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5789,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
df.iloc[0]

Sentiment          1
00                 0
10th               0
2013               0
50cents            0
                  ..
œrecalibratingâ    0
œsince             0
œthen              0
œwe                0
œyou               0
Name: 0, Length: 8807, dtype: int64

In [29]:
x.shape

(5791, 8806)

In [23]:
y=df["Sentiment"]
x=df.drop('Sentiment',axis=1)
x.columns = x.columns.astype(str)


In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.105,random_state=1232224)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8013136288998358

In [28]:
import pickle
pickle.dump(rf,open('stock_mdl2.pkl','wb'))