In [4]:
import pandas as pd

#Steps implemented for feature engineering
1. Import the Data
2. Remove the null record from the data

In [5]:
#Reading the input data
tweet_tr=pd.read_csv('train.csv')
tweet_tst=pd.read_csv('test.csv')

In [6]:
tweet_tr.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [7]:
tweet_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [8]:
print(tweet_tr[tweet_tr['text'].isnull()==True])

         textID text selected_text sentiment
314  fdb77c3752  NaN           NaN   neutral


In [9]:
#Deleting the null row of data
tweet_tr=tweet_tr.drop(labels=[314],axis=0)

In [10]:
tweet_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [11]:
#Checking test data for null
tweet_tst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     3534 non-null   object
 1   text       3534 non-null   object
 2   sentiment  3534 non-null   object
dtypes: object(3)
memory usage: 83.0+ KB


#All records in test set is clean

In [12]:
#Check for hashtags in text and select_text
text_hash=[]
idx=0
for i in tweet_tr['text']:
    if '#' in i:
        text_hash.append(idx)
    idx+=1
print(len(text_hash))
select_text_hash=[]
idx=0
for i in tweet_tst['text']:
    if '#' in i:
        select_text_hash.append(idx)
    idx+=1
print(len(select_text_hash))

542
76


As both the columns 'text' as well as 'select_text' contains # so we will keep the hastag

In [13]:
tweet_tr['sentiment'].value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [14]:
#Converting the tweets to lowercase
tweet_tr['text']=tweet_tr['text'].apply(lambda x: x.lower())
tweet_tst['text']=tweet_tst['text'].apply(lambda x: x.lower())

In [15]:
#Some engineering work to check what numeric data are present in the text field and in which category can be grouped under
import regex as re
#tweet_tr['text']=re.sub('\d+'," ",tweet_tr['text'])
dict={'neutral':0,'positive':0,'negative':0}
for i,j in enumerate(tweet_tr['selected_text']):
    if len(re.findall('\d+',str(j)))>0 :
        if tweet_tr.loc[i,'sentiment'] == 'positive':
            dict['positive']+=1
        elif tweet_tr.loc[i,'sentiment'] == 'negative':
            dict['negative']+=1
        else:
            dict['neutral']+=1
print(dict)

{'neutral': 839, 'positive': 715, 'negative': 602}


In [16]:
#Before creating the feature vector split the data into train and valid set
from sklearn.model_selection import train_test_split
X_train,X_valid=train_test_split(tweet_tr,test_size=0.2,random_state=42)

In [17]:
#Seggregating the data into 3 sets for the classes
X_pos=X_train[X_train['sentiment']=='positive']
X_neg=X_train[X_train['sentiment']=='negative']
X_neutral=X_train[X_train['sentiment']=='neutral']

In [35]:
#word vector for each class
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(max_df=0.95, min_df=2,max_features=10000,stop_words='english')

conv_X_train=cv.fit_transform(X_train['text'])

conv_pos=cv.transform(X_pos['text'])
conv_neg=cv.transform(X_neg['text'])
conv_neutral=cv.transform(X_neutral['text'])

In [36]:
conv_pos.toarray().shape

(6894, 8530)

In [38]:
#Converting the word vector into dataframe

train_pos=pd.DataFrame(conv_pos.toarray(),columns=cv.get_feature_names())
train_neg=pd.DataFrame(conv_neg.toarray(),columns=cv.get_feature_names())
train_neutral=pd.DataFrame(conv_neutral.toarray(),columns=cv.get_feature_names())

In [41]:
train_neg.columns

Index(['00', '000', '01', '03', '04', '05', '06', '07', '08', '09',
       ...
       '½6', '½a', '½m', '½n', '½re', '½s', '½t', '½ve', '½you', '½ï'],
      dtype='object', length=8530)

In [47]:
#Creating dictionary for positive, negative and neutral, so normalize the sum it has been divided by the number of rows for each class type
pos_df={}
neg_df={}
neutral_df={}

for k in cv.get_feature_names():
    pos_sum=train_pos[k].sum()
    neg_sum=train_neg[k].sum()
    neutral_sum=train_neutral[k].sum()
    
    pos_df[k]=pos_sum/X_pos.shape[0]
    neg_df[k]=neg_sum/X_neg.shape[0]
    neutral_df[k]=neutral_sum/X_neutral.shape[0]

In [51]:
pos_dict={}
neg_dict={}
neutral_dict={}

for key, value in pos_df.items():
    pos_dict[key]=pos_df[k]-(neutral_df[k]+neg_df[k])
    
for key, value in neg_df.items():
    neg_dict[key]=neg_df[k]-(neutral_df[k]+pos_df[k])
    
for key, value in neutral_df.items():
    neutral_dict[key]=neutral_df[k]-(pos_df[k]+neg_df[k])