In [1]:
#import pandas
import pandas as pd
#read the csv training file
train = pd.read_csv("train_E6oV3lV.csv",index_col="id")
#exploratory data analysis
print("Dataframe shape:",train.shape)
print("Columns:",train.columns.values)
print(train.head())

Dataframe shape: (31962, 2)
Columns: ['label' 'tweet']
    label                                              tweet
id                                                          
1       0   @user when a father is dysfunctional and is s...
2       0  @user @user thanks for #lyft credit i can't us...
3       0                                bihday your majesty
4       0  #model   i love u take with u all the time in ...
5       0             factsguide: society now    #motivation


In [2]:
#import BeautifulSoup for data cleaning
from bs4 import BeautifulSoup
#import regexp
import re
#import stopwords from nltk
from nltk.corpus import stopwords

In [3]:
#use WordNetLemmatizer to lemmatize the words
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize_tweets(raw_review):
    lemmatized_tweet=[]
    raw_review = raw_review.split()
    for w in raw_review:
        word = wordnet_lemmatizer.lemmatize(w)
        lemmatized_tweet.append(word)
    return(" ".join(lemmatized_tweet))
print(lemmatize_tweets("plays in sand and listens"))

play in sand and listens


In [4]:
#function to segregate only tweets
#defining function separate_tweets() to separate hashtags from main tweet
def separate_tweets(raw_review):
    #changing the whole tweet in lowercase so that hashtags with different cases can match and not be identified as different hashtags
    tweets_only = raw_review.lower()
    #finding all the hashtags
    tweets_only = re.findall("#\w+",tweets_only)
    #returning all hashtags as list
    return(" ".join(tweets_only))
#checking if our function works properly
print(separate_tweets("#i_am_a_good_boy i ama sick # #i_am_tired ##"))
print(separate_tweets("i am going to change everything #i ## #i_am ### @i @@ @# #@   @a@ab@abc #iam#i## #... #I_am_Avinesh"))
#it works :)

#i_am_a_good_boy #i_am_tired
#i #i_am #iam #i #i_am_avinesh


In [5]:
#stopwords stored in a set so that we dont have to access the nltk corpus everytimr
stops = set(stopwords.words("english"))
#function to separate all the words in the whole tweet including separation of words from their hashes and mentions
def review_tweets(raw_review):
    review_text = BeautifulSoup(raw_review,"html.parser").get_text()
    #utilising only the letters only from the whole tweet
    #this separates the words from their # and @
    letters_only = re.sub("[^a-zA-Z]"," ",review_text)
    #changing the whole tweet in lowercase so that tweets with different cases can match and not be identified as different tweets
    words = letters_only.lower().split()
    #splitting the whole tweet on the basis of white space
    meaningful_words = [w for w in words if w not in stops]
    return(lemmatize_tweets((" ".join(meaningful_words))))
#checking if our function works properly
print(review_tweets("i am going to change everything #i ## #i_am ### @i @@ @# #@   @a@ab@abc #iam#i## #... plays"))
#it works

going change everything ab abc iam play


In [6]:
#calculating the total size of the array
num_reviews = train.tweet.size
clean_tweets = []
clean_hashtags = []
for i in range(1,num_reviews):
    clean_tweets.append(review_tweets(train.tweet[i]))
    clean_hashtags.append(separate_tweets(train.tweet[i]))
    if(i%3000 == 0):
        print(i," tweets processed. . . . ")
print("processing completed")

3000  tweets processed. . . . 
6000  tweets processed. . . . 
9000  tweets processed. . . . 
12000  tweets processed. . . . 
15000  tweets processed. . . . 
18000  tweets processed. . . . 
21000  tweets processed. . . . 
24000  tweets processed. . . . 
27000  tweets processed. . . . 
30000  tweets processed. . . . 
processing completed


In [7]:
#creating bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer1 = CountVectorizer(analyzer="word",
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=None,
                            max_features=10000)
vectorizer2 = CountVectorizer(analyzer="word",
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=None,
                            max_features=1000)
print("fitting tweets to vectorizer")
tweets_data_features = vectorizer1.fit_transform(clean_tweets)
print("fitting hashtags to vectorizer")
hashtags_data_features = vectorizer2.fit_transform(clean_hashtags)
#changing it to numpy array as it is easy to wrok with them
print("changing all vectors in number form")
tweets_data_features = tweets_data_features.toarray()
hashtag_data_features = hashtags_data_features.toarray()

fitting tweets to vectorizer
fitting hashtags to vectorizer
changing all vectors in number form


In [8]:
print("Tweet bag of words:",tweets_data_features.shape)
print("Hashtag bag of words",hashtag_data_features.shape)
print("Feature names:",vectorizer1.get_feature_names())

Tweet bag of words: (31961, 10000)
Hashtag bag of words (31961, 1000)


In [9]:
import numpy as np
tweet_dist = np.sum(tweets_data_features,axis=0)
hashtag_dist = np.sum(hashtag_data_features,axis=0)
for tag,count in zip(vectorizer1.get_feature_names(),tweet_dist):
    print(count," : ",tag)
for tag,count in zip(vectorizer2.get_feature_names(),hashtag_dist):
    print(count," : ",tag)

2  :  aa
3  :  aaa
23  :  aap
3  :  aaron
9  :  ab
7  :  abandoned
4  :  abasel
4  :  abba
9  :  abc
4  :  abe
7  :  ability
3  :  abitur
73  :  able
3  :  ableg
5  :  ableism
3  :  aboion
3  :  aboutlastnight
10  :  abrahamhicks
4  :  abroad
3  :  absence
13  :  absolute
43  :  absolutely
4  :  abstract
3  :  abstracta
3  :  absurd
23  :  abt
8  :  abundance
17  :  abuse
3  :  abusing
4  :  abusive
5  :  ac
3  :  acab
7  :  academy
7  :  acc
7  :  accent
29  :  accept
10  :  acceptable
6  :  acceptance
27  :  accepted
6  :  accepting
13  :  access
3  :  accessnetwork
13  :  accessory
12  :  accident
4  :  accidentally
3  :  accomplishment
10  :  according
58  :  account
5  :  accountable
3  :  accusation
3  :  accuse
4  :  accused
11  :  ace
8  :  ache
13  :  achieve
4  :  achieved
8  :  achievement
4  :  aching
4  :  acknowledge
15  :  acne
6  :  acoustic
3  :  acquainted
4  :  acquired
3  :  acre
22  :  across
87  :  act
4  :  acted
22  :  acting
50  :  action
12  :  active
8  :  ac

5  :  bridetobe
20  :  bridge
5  :  brief
3  :  brigade
26  :  bright
6  :  brighten
18  :  brighton
31  :  brilliant
63  :  bring
12  :  bringing
3  :  bringit
8  :  bringiton
19  :  brings
5  :  brisbane
7  :  brisk
8  :  bristol
4  :  brit
6  :  britain
9  :  brithday
17  :  british
3  :  brittany
33  :  bro
3  :  broad
5  :  broadcast
4  :  broader
10  :  broadway
8  :  brochure
3  :  brock
38  :  broke
67  :  broken
6  :  brokenhea
5  :  brokenquotes
29  :  broker
6  :  bron
5  :  brook
10  :  brooklyn
5  :  bros
77  :  brother
22  :  brought
5  :  brow
31  :  brown
7  :  browning
5  :  browser
8  :  bruh
16  :  brunch
16  :  brunette
3  :  brunomars
3  :  brush
7  :  brussels
6  :  brutal
6  :  brutality
3  :  bsb
3  :  bst
9  :  bt
22  :  bts
15  :  btw
4  :  bu
23  :  bubble
7  :  bubbly
6  :  buck
6  :  bucket
4  :  bud
20  :  buddy
6  :  budget
144  :  buffalo
5  :  bug
23  :  build
3  :  builder
31  :  building
6  :  built
3  :  buisness
4  :  bulgaria
506  :  bull
3  :  bul

15  :  democrat
6  :  democratic
5  :  democraticpay
9  :  dems
4  :  den
14  :  denial
3  :  denied
7  :  denim
3  :  dennis
5  :  denounce
4  :  dental
4  :  dentist
4  :  deny
5  :  depament
3  :  depaure
4  :  depaures
3  :  depend
5  :  deplorable
3  :  deplorables
4  :  depp
76  :  depressed
3  :  depressededit
21  :  depressing
124  :  depression
3  :  dept
7  :  depth
3  :  derogatory
4  :  des
3  :  descending
17  :  describe
9  :  dese
37  :  deserve
5  :  deserved
14  :  deserves
70  :  design
10  :  designed
8  :  designer
5  :  designing
13  :  desire
7  :  desk
19  :  desperate
3  :  desperately
5  :  despise
66  :  despite
10  :  desse
10  :  destination
14  :  destiny
14  :  destroy
11  :  destroyed
4  :  destroying
7  :  destruction
3  :  destructive
20  :  detail
8  :  determination
12  :  determined
3  :  detour
7  :  detox
15  :  detoxdiet
7  :  detroit
11  :  dev
3  :  devastated
5  :  devastating
3  :  develop
8  :  developer
4  :  developing
84  :  development
4 

51  :  fly
5  :  flyer
29  :  flying
5  :  fm
13  :  fml
37  :  fo
26  :  focus
11  :  focused
4  :  focusing
4  :  folding
53  :  folk
372  :  follow
8  :  followback
16  :  followed
94  :  follower
3  :  followforfollow
36  :  following
159  :  followme
9  :  follows
3  :  followusoninstagram
3  :  folx
4  :  fomc
272  :  food
4  :  foodblogger
34  :  foodie
33  :  foodporn
4  :  foodstagram
20  :  fool
5  :  fooled
6  :  foolish
42  :  foot
5  :  footage
79  :  football
3  :  footballer
3  :  footie
3  :  footy
20  :  force
8  :  forced
6  :  ford
16  :  forecast
3  :  forecasted
6  :  foreign
12  :  forest
95  :  forever
4  :  foreverliving
265  :  forex
64  :  forget
3  :  forgetting
3  :  forgive
8  :  forgiveness
4  :  forgiver
27  :  forgot
13  :  forgotten
4  :  fork
22  :  form
3  :  format
3  :  formentera
12  :  former
3  :  forming
7  :  forum
167  :  forward
3  :  foster
6  :  fotokuapp
12  :  fought
6  :  fouh
9  :  founate
191  :  found
3  :  founder
3  :  fountain
24  

7  :  homeopathic
5  :  homeowner
3  :  homesick
2  :  homesta
7  :  homesweethome
5  :  hometown
6  :  homework
3  :  homicide
5  :  homies
2  :  homo
5  :  homophobe
11  :  homophobia
19  :  homophobic
3  :  homosexuality
17  :  honest
15  :  honestly
5  :  honesty
11  :  honey
4  :  honeymoon
8  :  hong
5  :  hongkong
12  :  honor
9  :  honored
2  :  honoring
5  :  honour
2  :  honoured
5  :  hoo
9  :  hood
2  :  hooded
4  :  hoodie
4  :  hoodies
8  :  hook
2  :  hookah
5  :  hooked
2  :  hookedbyaldi
3  :  hooker
2  :  hookup
6  :  hooligan
2  :  hoop
4  :  hooray
2  :  hop
272  :  hope
16  :  hopeful
30  :  hopefully
3  :  hopeless
28  :  hoping
2  :  hopkins
3  :  hopping
3  :  horizon
8  :  hormone
2  :  horn
52  :  horny
38  :  horrible
4  :  horribly
19  :  horrific
2  :  horrified
4  :  horrifying
20  :  horror
23  :  horse
2  :  horselovers
6  :  hoshi
23  :  hospital
2  :  hospitality
20  :  host
3  :  hosted
3  :  hostel
2  :  hostility
6  :  hosting
165  :  hot
37  :  hot

24  :  kkk
5  :  klan
3  :  klopp
10  :  km
9  :  knee
38  :  knew
7  :  knitting
7  :  knock
6  :  knocked
3  :  knossos
3  :  knot
471  :  know
29  :  knowing
18  :  knowledge
4  :  knowledgeable
25  :  known
3  :  knuckle
5  :  knw
5  :  ko
4  :  koala
3  :  kolkata
5  :  kong
3  :  kongebloggen
3  :  koran
17  :  korea
9  :  korean
3  :  korematsu
3  :  kp
14  :  kpop
6  :  kro
8  :  kscrashcorrectors
7  :  ku
5  :  kudos
6  :  ky
5  :  kylie
4  :  kylielipkit
4  :  kyrie
67  :  la
10  :  lab
9  :  label
7  :  labour
3  :  labrador
32  :  lack
3  :  lacking
11  :  lad
83  :  lady
8  :  ladyboy
5  :  laid
3  :  lajawab
29  :  lake
3  :  lalala
4  :  lamb
9  :  lame
3  :  lamp
4  :  lancaster
36  :  land
6  :  landed
5  :  landholding
6  :  landing
10  :  landscape
7  :  lane
8  :  lang
17  :  language
5  :  lap
4  :  lapride
13  :  laptop
13  :  large
9  :  largest
5  :  larry
3  :  lash
341  :  last
4  :  lastday
3  :  lastdayofschool
5  :  lasted
6  :  lasting
4  :  lastnight
16  

43  :  nation
55  :  national
9  :  nationalbestfriendday
17  :  nationalbestfriendsday
10  :  nationalist
3  :  nationality
3  :  nationallobsterday
3  :  nationalroseday
5  :  native
5  :  natsu
32  :  natural
3  :  naturalhair
6  :  naturally
94  :  nature
53  :  naughty
11  :  navy
24  :  nazi
42  :  nba
54  :  nbafinals
4  :  nbc
3  :  nbjc
5  :  nc
73  :  nd
5  :  nda
6  :  ne
33  :  near
3  :  nearest
35  :  nearly
3  :  nebraska
3  :  necessity
4  :  neck
12  :  necklace
607  :  need
3  :  needa
4  :  needahug
52  :  needed
3  :  needle
24  :  negative
10  :  negativity
4  :  negligence
6  :  negro
12  :  neighbor
11  :  neighborhood
7  :  neighbour
3  :  neighbourhood
4  :  neil
6  :  neither
3  :  neko
5  :  nemo
5  :  neo
3  :  nepal
16  :  nephew
17  :  nerd
83  :  nervous
35  :  ness
9  :  net
3  :  netanyahu
35  :  netflix
3  :  netflixandchill
8  :  netherlands
18  :  network
6  :  networking
6  :  neutral
3  :  neutrality
3  :  nevada
420  :  never
9  :  neverforget
4  

16  :  promote
3  :  promoted
10  :  promoting
11  :  promotion
3  :  prompt
3  :  pron
19  :  proof
5  :  prop
11  :  propaganda
7  :  proper
5  :  properly
7  :  propey
8  :  prosecco
6  :  prospect
6  :  prosperity
5  :  prostitute
16  :  protect
4  :  protected
6  :  protecting
11  :  protection
6  :  protein
13  :  protest
6  :  protester
15  :  protesting
3  :  protestors
4  :  prototype
181  :  proud
5  :  proudly
3  :  proudmom
12  :  prove
10  :  proved
8  :  proven
12  :  proverb
8  :  provide
4  :  provider
6  :  provides
6  :  providing
4  :  proving
3  :  provoking
3  :  psa
5  :  psalm
3  :  psyched
3  :  psychic
6  :  psycho
5  :  psychological
9  :  psychology
3  :  psychopath
3  :  psychosis
8  :  pt
3  :  ptsd
12  :  pub
36  :  public
3  :  publicity
3  :  publicly
3  :  published
6  :  publishing
3  :  pueobanus
5  :  pueorico
7  :  puff
8  :  pug
3  :  puglife
15  :  pull
39  :  pulse
3  :  pulsefitness
10  :  pulsenightclub
3  :  pulsenightclubshooting
11  :  pulse

72  :  sea
5  :  seal
5  :  sealed
3  :  sean
22  :  search
6  :  searching
22  :  seashepherd
10  :  seaside
148  :  season
5  :  seasonal
3  :  seasoned
24  :  seat
2  :  seated
6  :  seattle
2  :  seaview
4  :  seaworld
2  :  sebastian
3  :  sec
52  :  second
3  :  secondlife
53  :  secret
2  :  secretaryofstate
8  :  section
6  :  secure
4  :  secured
19  :  security
2  :  sedition
2  :  seduced
4  :  seductive
770  :  see
8  :  seed
101  :  seeing
21  :  seek
2  :  seeker
5  :  seeking
8  :  seeklearning
19  :  seem
6  :  seemed
67  :  seems
74  :  seen
4  :  segment
3  :  segregation
3  :  sel
8  :  selca
4  :  select
6  :  selected
5  :  selection
7  :  selena
3  :  selenagomez
72  :  self
3  :  selfcare
5  :  selfharm
4  :  selfharming
4  :  selfhelp
315  :  selfie
20  :  selfies
8  :  selfietime
13  :  selfish
13  :  selflove
6  :  selfporait
3  :  selfrespect
21  :  sell
5  :  seller
15  :  selling
6  :  semester
11  :  semi
4  :  seminar
8  :  semitic
5  :  semitism
3  :  se

6  :  teammate
2  :  teamo
5  :  teamshide
2  :  teamspeak
2  :  teamspirit
12  :  teamsuperjunior
9  :  teamwork
4  :  teapay
175  :  tear
5  :  tease
11  :  teaser
49  :  tech
3  :  techjunkiejh
4  :  technical
7  :  techno
12  :  technology
3  :  ted
4  :  tedatibm
6  :  teddy
5  :  tedtalks
8  :  tee
67  :  teen
10  :  teenager
17  :  teeth
3  :  tel
4  :  teleprompter
3  :  teletubbiesusa
115  :  tell
25  :  telling
6  :  temecula
3  :  temper
36  :  temple
5  :  temporary
21  :  ten
6  :  tend
7  :  tenerife
9  :  tennessee
10  :  tennis
3  :  tense
7  :  tent
5  :  tequila
3  :  teresa
33  :  term
28  :  terrible
7  :  terribly
7  :  terrific
6  :  terrified
20  :  terror
3  :  terrorattack
36  :  terrorism
45  :  terrorist
3  :  terry
51  :  test
5  :  testimonial
4  :  testimony
12  :  testing
33  :  texas
30  :  text
3  :  texting
3  :  tf
88  :  tgif
3  :  tgifriday
3  :  tgifridays
241  :  th
6  :  tha
6  :  thai
25  :  thailand
5  :  thalaivaa
309  :  thank
952  :  thankfu

3  :  wright
22  :  write
25  :  writer
5  :  writerslife
48  :  writing
12  :  written
94  :  wrong
17  :  wrote
26  :  wso
3  :  wt
56  :  wtf
7  :  wth
3  :  wti
4  :  wud
3  :  ww
26  :  wwdc
14  :  wwe
3  :  wwii
5  :  www
3  :  wynonna
3  :  wypipo
3  :  xactaccounts
15  :  xbox
9  :  xboxe
8  :  xboxone
3  :  xboxones
3  :  xc
13  :  xd
15  :  xenophobia
6  :  xenophobic
4  :  xian
6  :  xmas
6  :  xo
28  :  xoxo
43  :  xx
68  :  xxx
4  :  xxxx
32  :  ya
3  :  yaay
4  :  yah
13  :  yall
4  :  yard
5  :  yass
105  :  yay
3  :  yayyy
4  :  yayyyy
6  :  ye
13  :  yea
156  :  yeah
3  :  yeahhh
555  :  year
3  :  yearswithinfinite
3  :  yeezy
3  :  yeezyboost
5  :  yeg
3  :  yegfood
15  :  yelchin
5  :  yell
3  :  yelling
21  :  yellow
3  :  yelp
5  :  yen
19  :  yep
206  :  yes
3  :  yesallwomen
92  :  yesterday
141  :  yet
3  :  yey
3  :  yg
4  :  yield
4  :  ynwa
68  :  yo
61  :  yoga
3  :  yogalove
8  :  yogi
19  :  yolo
47  :  york
4  :  yorkshire
141  :  young
7  :  younger
3  

13  :  summer2016
10  :  summerâ
171  :  sun
156  :  sunday
15  :  sundayfunday
16  :  sundaymorning
51  :  sunny
16  :  sunnyday
33  :  sunset
61  :  sunshine
11  :  super
13  :  sushi
20  :  swag
41  :  sweet
13  :  swim
12  :  swimming
10  :  sydney
13  :  sâ
15  :  tagsforlikes
34  :  tampa
10  :  tan
13  :  tattoo
16  :  tb
130  :  tbt
20  :  tcot
27  :  team
14  :  teambts
12  :  teambtsâ
15  :  tears
38  :  tech
16  :  teen
11  :  teeth
28  :  temple
14  :  terrorism
11  :  texas
77  :  tgif
14  :  thailand
491  :  thankful
36  :  thanks
57  :  thankyou
24  :  the
12  :  theatre
16  :  therapy
12  :  theresistance
10  :  thoughts
20  :  throwback
10  :  throwbackthursday
55  :  thursday
14  :  thursdaythoughts
43  :  time
11  :  tips
45  :  tired
15  :  to
73  :  today
10  :  toddler
19  :  together
16  :  tokyo
10  :  tonight
10  :  tonyawards
20  :  toptags
11  :  toronto
18  :  tragedy
28  :  tragic
17  :  training
125  :  travel
10  :  traveling
17  :  trending
26  :  trip
2

In [10]:
print(type(hashtag_data_features))
print(type(tweets_data_features))
print(hashtag_data_features.shape)
print(tweets_data_features.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(31961, 1000)
(31961, 10000)


In [None]:
#joining the two numpy arrays to perform the fitting together
final_data_features = np.concatenate((tweets_data_features,hashtag_data_features),axis=1)
print(final_data_features.shape)

In [None]:
#applying RandomForest classifier
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit((total_data_features),train["label"])

In [None]:
#read the test file
test = pd.read_csv("test_tweets_anuFYb8.csv")
num_reviews = len(test["tweet"])
clean_tweet_review = []
clean_hash_review = []
print("cleaning and parsing the data . . . .")
for i in range(num_reviews):
    if((i+1)%3000==0):
        print("working on cleaning the test data")
    temp = test["tweet"][i]
    clean_review = review_tweets(test[temp])
    clean_tweet_review.append(clean_review)
    clean_hash = separate_tweets(temp)
    clean_hash_review.append(clean_hash)
tweet_data_features = vectorizer1.transform(clean_tweet_review)
tweet_data_features = test_data_features.toarray()
hash_data_features = vectorizer2.transform(clean_hash_review)
hash_data_features = hash_data_features.toarray()
result = forest.predict(test_data_features)
#create a DataFrame containing the prediction
output = pd.DataFrame(data={"id":test["id"], "label":result})
#output the total result to a .csv file
output.to_csv("predicted.csv",index=False)