# Importing data and libraries

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Directory holding the per-sector Excel exports. Kept in one place so the
# location only has to be changed once when the notebook is run elsewhere.
DATA_DIR = '/content/drive/MyDrive/Sentiment_Analysis_Stock_Prediction/DATA'

# One frame per sector; the first spreadsheet column is used as the row index.
df_auto = pd.read_excel(f'{DATA_DIR}/automobile.xlsx', index_col=0)
df_fashion = pd.read_excel(f'{DATA_DIR}/fashion.xlsx', index_col=0)
df_finance = pd.read_excel(f'{DATA_DIR}/finance.xlsx', index_col=0)
df_tech = pd.read_excel(f'{DATA_DIR}/tech.xlsx', index_col=0)

In [None]:
# Keep independent snapshots of the freshly loaded frames. A plain assignment
# would only bind a second name to the same object, so any later in-place edit
# of df_auto etc. would silently change the "original" as well.
df_auto_orig = df_auto.copy()
df_fashion_orig = df_fashion.copy()
df_finance_orig = df_finance.copy()
df_tech_orig = df_tech.copy()

# Merging the dataframes

In [None]:
# Stack the four sector frames row-wise into a single frame. The per-file
# index values are carried over unchanged by the concatenation.
frames = [
    df_auto,
    df_fashion,
    df_finance,
    df_tech,
]
df = pd.concat(frames, axis=0)

In [None]:
# Snapshot the merged frame. `.copy()` makes df_orig an independent object;
# a bare `df_orig = df` would only alias the same frame.
df_orig = df.copy()

In [None]:
# Rebind the working name `df` to the frame referenced by `df_orig`
# (presumably so the pre-processing cells below can be re-run from the merged data).
df = df_orig

# Data pre-processing

### Dropping duplicate rows

In [None]:
# Flag rows that exactly repeat an earlier row, report how many there are,
# and keep only the first occurrence of each.
duplicate_mask = df.duplicated()
print('Shape before : ', df.shape)
print('Number of duplicated rows : ', int(duplicate_mask.sum()))
df = df[~duplicate_mask]
print('Shape after : ', df.shape)

Shape before :  (129866, 6)
Number of duplicated rows :  2654
Shape after :  (127212, 6)


### Dropping sources and companies at or below the threshold

In [None]:
# Minimum row counts used by the filters below. The comparison there is a
# strict `>`, so a source/company must have MORE rows than the threshold to be kept.
thresh_source = 30
thresh_company = 20

In [None]:
# Keep only the rows whose source occurs more than `thresh_source` times,
# then renumber the rows from zero.
df = (
    df.groupby('source')
    .filter(lambda group: len(group) > thresh_source)
    .reset_index(drop=True)
)
print('Number of sources remaining =>', df['source'].nunique(dropna=False))
print(df.shape)

Number of sources remaining => 75
(127090, 6)


In [None]:
# Keep only the rows whose company occurs more than `thresh_company` times,
# then renumber the rows from zero.
df = (
    df.groupby('company')
    .filter(lambda group: len(group) > thresh_company)
    .reset_index(drop=True)
)
print('Number of companies remaining =>', df['company'].nunique(dropna=False))
print(df.shape)

Number of companies remaining => 75
(127048, 6)


### Basic pre-processing steps

In [None]:
# NLTK's English stop-word list, held in a set for membership tests.
stoplist = set(stopwords.words("english"))


def remove_stopwords(text):
    """Tokenize ``text``, lower-case every token and drop the stop words.

    Args:
        text: String to filter.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return " ".join(word for word in lowered if word not in stoplist)

In [None]:
def clean(text):
    """Replace every run of non-letter characters with a single space.

    Only the ASCII letters A-Z and a-z are kept; digits, punctuation and
    whitespace runs each collapse to one space.

    Args:
        text: String to clean.

    Returns:
        The cleaned string (leading/trailing spaces are not stripped).
    """
    non_letters = re.compile('[^A-Za-z]+')
    return non_letters.sub(' ', text)

In [None]:
def remove_punctuation(text):
    """Delete the characters ? . ; : ! and " from ``text``.

    The characters are removed outright (no space is inserted in their place);
    every other character, including commas, is left untouched.

    Args:
        text: String to strip.

    Returns:
        ``text`` without the listed punctuation characters.
    """
    dropped = ("?", ".", ";", ":", "!", '"')
    kept = [char for char in text if char not in dropped]
    return "".join(kept)

In [None]:
# Single shared lemmatizer instance, reused for every call below.
lemmatizer = WordNetLemmatizer()


def lemmatize_sentence(text):
    """Lemmatize each token of ``text`` and re-join the result with spaces.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default part of speech applies to every token.

    Args:
        text: String to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [None]:
def format_strings(text):
    """Replace each character that is not an ASCII letter or digit with a space.

    The substitution is one-for-one, so the length of the string is preserved
    and consecutive special characters yield consecutive spaces.

    Args:
        text: String to normalize.

    Returns:
        The string with every non-alphanumeric character turned into a space.
    """
    non_alphanumeric = re.compile('[^A-Za-z0-9]')
    return non_alphanumeric.sub(' ', text)

In [None]:
def pre_processing(text):
    """Run the full text-cleaning pipeline on one string.

    The steps are applied in this order: remove_punctuation, clean,
    remove_stopwords, lemmatize_sentence. Each step receives the output of
    the previous one.

    Args:
        text: Raw string to process.

    Returns:
        The string produced by the last step of the pipeline.
    """
    pipeline = (remove_punctuation, clean, remove_stopwords, lemmatize_sentence)
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Article text goes through the full NLP pipeline; the source and title
# columns only have their non-alphanumeric characters replaced by spaces.
df['text'] = df['text'].apply(pre_processing)
for column in ('source', 'title'):
    df[column] = df[column].apply(format_strings)

In [None]:
# Display (rows, columns) of the frame after the cleaning cell above.
df.shape

(127048, 6)

# Save output to xlsx

In [None]:
# Write the cleaned frame to an Excel workbook; index=False leaves the row index out of the file.
df.to_excel('/content/drive/MyDrive/Sentiment_Analysis_Stock_Prediction/DATA/clean_dataset.xlsx', index=False)