In [24]:
# Import pandas for data handling
import pandas as pd
import numpy as np
# NLTK is the Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Download the NLTK resources used below.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer behind word_tokenize from 'punkt_tab' rather than 'punkt';
# fetching both keeps the notebook runnable on old and new installs.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)
# NOTE: this rebinds `stopwords` from the imported corpus reader to a plain
# list of English stop words; remove_stopwords() looks tokens up in that list.
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Sushobhan
[nltk_data]     Parajuli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sushobhan
[nltk_data]     Parajuli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sushobhan
[nltk_data]     Parajuli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the raw headline data.
# `is_up` mixes numeric-looking labels with the text marker "Na", and later
# cells treat it as text (.str methods, comparison with '1'), so read it as
# strings explicitly instead of relying on per-chunk dtype inference.
df = pd.read_csv("data/is_up.csv", dtype={"is_up": str})
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,stock,is_up
0,0,Stocks That Hit 52-Week Highs On Friday,2020-06-05,A,0
1,1,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03,A,1
2,2,71 Biggest Movers From Friday,2020-05-26,A,1
3,3,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22,A,1
4,4,B of A Securities Maintains Neutral on Agilent...,2020-05-22,A,1


In [3]:
# Drop the "Unnamed: 0" column (presumably an index saved by an earlier export
# -- TODO confirm). errors="ignore" keeps this cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df

Unnamed: 0,title,date,stock,is_up
0,Stocks That Hit 52-Week Highs On Friday,2020-06-05,A,0
1,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03,A,1
2,71 Biggest Movers From Friday,2020-05-26,A,1
3,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22,A,1
4,B of A Securities Maintains Neutral on Agilent...,2020-05-22,A,1
...,...,...,...,...
337221,IPO for Pfizer's Zoetis Prices 86.1M Shares at...,2013-01-31,ZTS,Na
337222,"ISI Group Initiates Coverage on Zoetis at Buy,...",2013-01-31,ZTS,Na
337223,"Pfizer, Spinoff Zoetis Receive Positive Mad Mo...",2013-01-23,ZTS,Na
337224,Will These 2 IPOs Flourish?,2013-01-22,ZTS,Na


In [5]:
# Keep only the rows with a usable binary label.
# `is_up` holds text at this point; matching exactly on "0"/"1" removes the
# "Na" and "2" rows (and any missing values) without the regex/substring
# semantics of str.contains, which would also discard any label that merely
# contains one of those characters.
# .copy() makes the result an independent frame so that the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[df["is_up"].isin(["0", "1"])].copy()
df

Unnamed: 0,title,date,stock,is_up
0,Stocks That Hit 52-Week Highs On Friday,2020-06-05,A,0
1,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03,A,1
2,71 Biggest Movers From Friday,2020-05-26,A,1
3,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22,A,1
4,B of A Securities Maintains Neutral on Agilent...,2020-05-22,A,1
...,...,...,...,...
337210,Hilliard Lyons Initiates Coverage on Zoetis at...,2013-03-12,ZTS,0
337211,Hilliard Lyons Initiates Coverage on Zoetis at...,2013-03-12,ZTS,0
337212,UPDATE: BMO Capital Markets Initiates Zoetis a...,2013-03-08,ZTS,0
337213,BMO Capital Initiates Coverage on Zoetis at Un...,2013-03-08,ZTS,0


In [6]:
# Normalise case
def make_lower(a_string):
    """Return a copy of ``a_string`` with every cased character lowercased."""
    lowered = a_string.lower()
    return lowered

# Strip punctuation
def remove_punctuation(a_string):
    """Delete every character that is neither a word character nor whitespace.

    Matching characters are removed outright rather than replaced by a space,
    so "mid-day" becomes "midday". Underscores count as word characters and
    are therefore kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', a_string)

def remove_number(a_string):
    """Delete the ASCII digits 0-9 from ``a_string``.

    Only the digit characters are removed; surrounding whitespace is left as
    is, so a token made up solely of digits leaves its neighbouring spaces
    behind.
    """
    ascii_digit = re.compile(r'[0-9]')
    return ascii_digit.sub('', a_string)

In [7]:
def remove_stopwords(a_string):
    """Tokenize ``a_string`` and drop every token found in ``stopwords``.

    The surviving tokens are joined back together with single spaces.
    """
    kept = [token for token in word_tokenize(a_string) if token not in stopwords]
    return ' '.join(kept)

In [8]:
def stem_words(a_string):
    """Tokenize ``a_string`` and replace each token with its Porter stem.

    Returns the stems joined by single spaces.
    """
    # A fresh stemmer is created on every call.
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in word_tokenize(a_string)]
    return ' '.join(stems)

In [9]:
def text_pipeline(input_string):
    """Clean one headline string for vectorisation.

    Steps, in order: lowercase, strip punctuation, strip digits, remove stop
    words, stem.

    Stop words are removed *before* stemming. The stop-word list holds
    unstemmed words, so stemming first lets stop words whose stem differs from
    the word itself (e.g. "this" -> "thi") slip past the filter, and makes
    ordinary words that stem into a stop word (e.g. "ups" -> "up") disappear.

    Digits are stripped before the two tokenising steps so that the gaps they
    leave are collapsed when the tokens are re-joined with single spaces.
    """
    input_string = make_lower(input_string)
    input_string = remove_punctuation(input_string)
    input_string = remove_number(input_string)
    input_string = remove_stopwords(input_string)
    input_string = stem_words(input_string)
    return input_string

In [10]:
# Run every headline through the cleaning pipeline and overwrite the raw text.
cleaned_titles = df['title'].apply(text_pipeline)
df['title'] = cleaned_titles

In [11]:
# Rows labelled '1' are "Up"; every other remaining label is treated as "Down".
c1 = df['is_up'].eq('1')

df['movement'] = np.where(c1, "Up", "Down")

# The raw label is redundant once `movement` exists.
df = df.drop("is_up", axis=1)

df.head()

Unnamed: 0,title,date,stock,movement
0,stock hit week high friday,2020-06-05,A,Down
1,stock hit week high wednesday,2020-06-03,A,Up
2,biggest mover friday,2020-05-26,A,Up
3,stock move friday midday session,2020-05-22,A,Up
4,b secur maintain neutral agil technolog rais p...,2020-05-22,A,Up


In [38]:
# Spot-check the cleaned headlines at the end of the frame.
df['title'].tail(30)

337185           zoeti report q ep vs est revenu b vs b est
337186    bmo capit market reiter underperform rate pt z...
337187        bmo capit maintain underperform zoeti rais pt
337188                                   benzinga top initi
337189         citigroup initi coverag zoeti buy announc pt
337190    updat credit suiss initi zoeti neutral stock c...
337191                                   benzinga top initi
337192    credit suiss initi coverag zoeti neutral annou...
337193                   analyst initi coverag zoeti believ
337194                                   benzinga top initi
337195    updat piper jaffray initi zoeti overweight gro...
337196    updat jefferi initi zoeti buy strong growth pr...
337197    updat jp morgan initi zoeti overweight leaders...
337198    updat deutsch bank initi zoeti buy market lead...
337199    updat bank america initi zoeti neutral growth ...
337200    updat goldman sach initi zoeti neutral upsid l...
337201    updat morgan stanley initi zoe

In [40]:
#df.to_csv('data/up_down_final',index=False)