## Feature Engineering

* Feature engineering is the process of extracting features from raw data using domain expertise and data mining techniques.
* We must first clean and reshape the data before we can use it to train our model.

In [7]:
!pip install imbalanced-learn
!pip install scikit-learn
!pip install scipy



In [8]:
#packages for general processing of data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import warnings
# set plot style
sns.set()

#packages for natural language processing
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
lemmatizer = WordNetLemmatizer()
import string

#Packages for machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

#packages for checking the performance of the models used
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#packages for web scraping
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re


#packages for balancing our data
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE

In [9]:
# Read the train set, the test set and the sample submission from the current working directory.
df_train, df_test, sample = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [10]:
# Work on copies so the frames loaded from disk stay untouched.
df, test_data = df_train.copy(), df_test.copy()

In [14]:
# Stack the train rows on top of the test rows so both get the same text cleaning.
# 'sentiment' is dropped from the train rows first so both parts share the same columns.
df_all = pd.concat([df.drop(columns='sentiment'), test_data])

# Count the missing values per column in the combined frame.
df_all.isna().sum()

message    0
tweetid    0
dtype: int64

In [15]:
# Lower-case every message so later matching is case-insensitive.
df_all['message'] = df_all['message'].str.lower()

In [16]:
# Repeats the lower-casing from the previous cell; on already lower-cased text it changes nothing.
df_all['message'] = df_all['message'].str.lower()

#helper functions used by the text-cleaning steps further down in this cell
def remove_punctuation(post):
    """Return ``post`` with every character listed in ``string.punctuation`` removed."""
    kept_chars = []
    for char in post:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

def mbti_lemma(words, lemmatizer):
    """Return a new list holding ``lemmatizer.lemmatize(word)`` for each item of ``words``, in order."""
    lemmas = []
    for word in words:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def _fetch_title(url, timeout=10.0, fallback='url_web'):
    """Return the text of the page's <title> tag, or ``fallback`` when it cannot be obtained."""
    try:
        # The context manager closes the HTTP response even if parsing raises.
        with urlopen(url, timeout=timeout) as response:
            # Naming the parser keeps the result independent of which optional parsers are installed.
            soup = BeautifulSoup(response, 'html.parser')
    except Exception:
        # Best effort: a malformed URL, a network error or a parse failure all map to the placeholder.
        return fallback
    title_tag = soup.title
    return title_tag.text if title_tag is not None else fallback


def extract(m, timeout=10.0):
    """Add the words of each linked page's title to the token list ``m``.

    Every string token that starts with ``'http'`` is treated as a URL and
    downloaded; the words of the page title are added to the end of ``m``.
    When the page or its title cannot be obtained, the single placeholder word
    ``'url_web'`` is added instead. The link tokens themselves stay in ``m``.

    Parameters
    ----------
    m : list of str
        Tokens of one message. Modified in place.
    timeout : float, optional
        Seconds to wait for each download before giving up.

    Returns
    -------
    list of str
        The same list object ``m``, so the function can be used with ``Series.apply``.
    """
    # Iterate over a snapshot: words added to `m` must not be visited as link candidates.
    for token in list(m):
        if isinstance(token, str) and token.startswith('http'):
            # extend (not append) keeps `m` a flat list of strings for the later lemmatize/join steps.
            m.extend(_fetch_title(token, timeout).split())
    return m

            
# Text-cleaning pipeline for df_all['message'].
# Links are removed first, while every message is still a raw string: the URL pattern
# needs the '://' that punctuation stripping deletes, and after split() each message
# is a list of tokens rather than a string the pattern could be applied to.

#we remove links
print('removing links')
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r''
df_all['message'] = df_all['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

#we remove punctuations and split each message into a list of tokens
print('removing punctuations')
df_all['message'] = df_all['message'].apply(remove_punctuation)
df_all['message'] = df_all['message'].apply(lambda x:x.split())

#extracting information from the links is currently disabled
#NOTE: if re-enabled, extract() needs tokens that still hold the original URLs,
#so it would have to run before the link-removal step above
print('extracting information from links')
#df_all['message'] = df_all['message'].apply(lambda x: extract(x))

#we apply lematizers and join the tokens back into one string per message
print('applying lematization')
df_all['message'] = df_all['message'].apply(mbti_lemma, args=(lemmatizer, ))
df_all['message'] = df_all['message'].apply(lambda x: ' '.join(x))

removing punctuations
extracting information from links
removing links
applying lematization


We then transform the cleaned messages into numeric count features for model training.

In [17]:
# Bag-of-words features: counts of 1- to 3-word n-grams with English stop words removed.
# The vocabulary is capped at 8000 terms that occur in at least 4 messages and in at most half of them.
vect = CountVectorizer(lowercase=True,max_features=8000, stop_words='english',analyzer='word', ngram_range=(1, 3), min_df=4, max_df=0.5)
X_count = vect.fit_transform(df_all['message'].values.astype(str))

#we devide our dataset back to the training and testing set
# df_all was built as [train rows, test rows], so the first len(df) rows are the training rows.
n_train = len(df)
train = X_count[:n_train]
test = X_count[n_train:]

#we then devide our data into x and y for machine learning algorithm
# NOTE: this silences every warning from here on, not only the ones raised in this cell.
warnings.filterwarnings('ignore')
X = train.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old name on older releases.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
train = pd.DataFrame(X, columns = feature_names)
y = df.sentiment
train['y'] = y


#we print the shape of our training and testing data to ensure that we are good
print(f'training data shape is {train.shape}')
print(f'testind data shape is {test.shape}')

training data shape is (15819, 8001)
testind data shape is (10546, 8000)
