In [1]:
import numpy as np
import pandas as pd
import os
import re
import emoji

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [2]:

df=pd.read_csv('nlptrain.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# Missing Values

In [3]:
def missingValues(data):
    total=data.isnull().sum().sort_values(ascending=False)
    percentage=(total/len(data))
    return pd.concat([total,percentage],axis=1,keys=['Total','Percentage'])

missingValues(df)

Unnamed: 0,Total,Percentage
selected_text,1,3.6e-05
text,1,3.6e-05
sentiment,0,0.0
textID,0,0.0


In [4]:
df=df.dropna()

In [5]:
def countValues(df,col):
    counts=df[col].value_counts().dropna()
    percentage=round(counts*100/np.sum(counts.values),2)
    return pd.concat([counts,percentage],axis=1,keys=['Total','Percentage'])
    
countValues(df,'sentiment')

Unnamed: 0,Total,Percentage
neutral,11117,40.45
positive,8582,31.23
negative,7781,28.32


In [6]:
def uniqueValues(df,col):
    uniques=np.unique(df[col],return_counts=True)
    return pd.DataFrame({'Title':uniques[0],
                         'Counts':uniques[1]})


uniqueValues(df,'sentiment')

Unnamed: 0,Title,Counts
0,negative,7781
1,neutral,11117
2,positive,8582


In [7]:
def duplicateValues(df):
    dups=[]
    columns=df.columns
    for i in df.columns:
        dups.append(sum(df[i].duplicated()))
    return pd.concat([pd.Series(columns),pd.Series(dups)],axis=1,
                    keys=['Columns','Counts'])
duplicateValues(df)

Unnamed: 0,Columns,Counts
0,textID,0
1,text,0
2,selected_text,5017
3,sentiment,27477


In [8]:
df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27480,27480,27480,27480
unique,27480,27480,22463,3
top,5a1653eae7,_Henrie you are amazingg. replyy! it`s a dream...,good,neutral
freq,1,1,199,11117


In [9]:
def find_url(string):
    text=re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',string)
    return ''.join(text)
    
    
find_url('I love spending time at https://www.kaggle.com/')

'https://www.kaggle.com/'

In [10]:
df['url']=df['text'].apply(lambda x: find_url(x))

In [11]:
df.head(10)['url']

0                                  
1                                  
2                                  
3                                  
4                                  
5    http://www.dothebouncy.com/smf
6                                  
7                                  
8                                  
9                                  
Name: url, dtype: object

In [12]:
def removeEmoji(text):
    text=text.split()
    text=[i for i in text if i.isalpha()]
    text=' '.join(text)
    return text

In [13]:
removeEmoji("I love ⚽ very much 😁")


'I love very much'

In [14]:
sentence="Its all about \U0001F600 face"
removeEmoji(sentence)

'Its all about face'

In [15]:
def findEmail(text):
    line=re.findall(r'[\w\.-]+@[\w\.-]+',text)
    return ','.join(line)
    
    

findEmail("My gmail is abc99@gmail.com")

'abc99@gmail.com'

In [16]:
df['email']=df['text'].apply(lambda x: findEmail(x))

In [17]:
(df['email']=='').value_counts()

True    27480
Name: email, dtype: int64

In [18]:
def findHash(text):
    line=re.findall(r'(?<=#)\w+',text)
    return ' '.join(line)



findHash('Rishav is #name Paudel is #surname and @student')

'name surname'

In [19]:
df['hash']=df['text'].apply(lambda x: findHash(x))

In [20]:
def findMentions(text):
    line=re.findall(r'(?<=@)\w+',text)
    return ' '.join(line)

df['mentions']=df['text'].apply(lambda x: findMentions(x))


In [21]:
df[df['mentions']!=''].head(4)

Unnamed: 0,textID,text,selected_text,sentiment,url,email,hash,mentions
220,e70d294d95,@_Ra_Ra_ your not alone...i need coffee too.,your not alone...i need coffee too.,neutral,,,,_Ra_Ra_
988,a732d0a8eb,nite nite twitts i wish u all a happy sunday i...,happy,positive,,,,_agressiva23
1043,9a03198e38,need some more enthu movie guys for that. Rig...,need some more enthu movie guys for that. Righ...,neutral,,,,_Anshul
1083,e9c337f756,@_TWEE haha thanks to you that`s my new word f...,thanks to you th,positive,,,,_TWEE


In [22]:
def findNumber(text):
    line=re.findall(r'[0-9]+',text)
    return " ".join(line)

df['Numbers']=df['text'].apply(lambda x: findNumber(x))

In [23]:
def find_phone_number(text):
    line=re.findall(r"[0-9]{10}\b",text)
    return " ".join(line)
find_phone_number('Rishav 9805398393 paudel 8897065512')

'9805398393 8897065512'

In [24]:
df['Ph Numbers']=df['text'].apply(lambda x: find_phone_number(x))

In [25]:
def find_year(text):
    line=re.findall(r"19[0-9]+|20[0-9]+|17[0-9]+|18[0-9]+",text)
    return ' '.join(line)

sent='india got independence in 1994 and india is 2012'
find_year(sent)

'1994 2012'

In [26]:
df['Year']=df['text'].apply(lambda x: find_year(x))

In [27]:
def stopWords(text):
    stop_words=stopwords.words('english')
    words=[i for i in text.split() if i in stop_words]
    return ' '.join(words)
    
    
    
    
    
stopWords('This is a sample text off a nice the is')

'is a off a the is'

In [28]:
def Ngrms_top(corpus,ngram_range,n=None):
    vec=CountVectorizer(stop_words='english',ngram_range=ngram_range).fit(corpus)
    
    bow=vec.transform(corpus)
    
    sum_w=bow.sum(axis=0)
    words_freq=[(word,sum_w[0,idx]) for word,idx in vec.vocabulary_.items()]
    words_freq=sorted(words_freq,key=lambda x: x[1],reverse=True)
    total_list=words_freq[:n]
    
    
    return pd.DataFrame(total_list,columns=['text','count'])
    
    
    
Ngrms_top(df['text'],(1,1),n=10)

Unnamed: 0,text,count
0,just,2278
1,day,2115
2,good,1578
3,like,1353
4,http,1247
5,work,1150
6,today,1147
7,love,1145
8,going,1103
9,got,1085


In [29]:
Ngrms_top(df['text'],(2,2),n=10)

Unnamed: 0,text,count
0,mother day,358
1,twitpic com,334
2,http twitpic,332
3,mothers day,279
4,happy mother,275
5,just got,219
6,happy mothers,199
7,http bit,180
8,bit ly,180
9,good morning,176


In [30]:
Ngrms_top(df['text'],(3,3),n=10)

Unnamed: 0,text,count
0,http twitpic com,332
1,happy mother day,268
2,happy mothers day,195
3,http bit ly,180
4,http tinyurl com,166
5,http plurk com,109
6,star wars day,73
7,http blip fm,73
8,happy star wars,56
9,just got home,53


Repetitive character

In [31]:
def rep(text):
    grp = text.group(0)
    if len(grp) > 1:
        return grp[0:1] # can change the value here on repetition

In [32]:
def unique_char(rep,sentence):
    convert = re.sub(r'(\w)\1+', rep, sentence) 
    return convert

In [33]:
sentence="heyyy this is loong textttt sooon"
unique_char(rep,sentence)

'hey this is long text son'

In [34]:
df['unique_char']=df['text'].apply(lambda x : unique_char(rep,x))


In [35]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,url,email,hash,mentions,Numbers,Ph Numbers,Year,unique_char
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,,,,,,,,"I`d have responded, if I were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,,,,,,,,So SAD I wil mis you here in San Diego!!!
2,088c60f138,my boss is bullying me...,bullying me,negative,,,,,,,,my bos is bulying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,,,,,,,,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,,,,,,,,"Sons of ****, why couldn`t they put them on t..."


In [36]:
def find_dollar(text):
    line=re.findall(r'\$\dp+(?:\.\d+)?',text)
    return " ".join(line)

# \$ - dollar sign followed by
# \d+ one or more digits
# (?:\.\d+)? - decimal which is optional