# Data Cleaning (not lemmatized)

In [1]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/paula/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the raw train/test CSV files (latin1-encoded) and report their dimensions
train, test = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in ('Corona_NLP_train.csv', 'Corona_NLP_test.csv')
)
for label, frame in (("Train shape : ", train), ("Test shape : ", test)):
    print(label, frame.shape)

Train shape :  (41157, 6)
Test shape :  (3798, 6)


In [29]:
# Total number of raw tweets, derived from the loaded frames instead of
# re-typing the row counts printed above
len(train) + len(test)

44955

#### Divide into three Sentiment Classes
Encode the categories with numbers and create three sentiment categories: Positive, Neutral and Negative.

In [3]:
# Collapse the five original labels into three integer classes:
# 0 = negative, 1 = neutral, 2 = positive
sentiment_classes = {
    'Extremely Negative': 0,
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Extremely Positive': 2,
}
for frame in (train, test):
    frame['Sentiment'] = frame['Sentiment'].map(sentiment_classes)

#### Create function for pre-processing
The lemmatization step of this function is disabled in this notebook; it is applied only to the data fed to models that use count-based feature extraction (BoW, PoS, TF-IDF). The "data" directory contains the non-lemmatized data, while the "data lemmatized" directory contains the same data after lemmatization.

In [4]:
# Function to Clean the Tweet column
def clean_tweet(input_tweets, lemmatize=False):
    """Normalize a Series of raw tweets into lowercase, ASCII-only text.

    The tweets are lowercased and then passed through a fixed sequence of
    regex substitutions (URLs, usernames, punctuation, non-ASCII characters,
    digits). Whitespace is not collapsed, so the result may contain runs of
    spaces.

    Parameters
    ----------
    input_tweets : pandas.Series
        Series of tweet strings.
    lemmatize : bool, default False
        When True, each cleaned tweet is additionally split into ``\\w+``
        tokens, every token is passed through NLTK's ``WordNetLemmatizer``,
        and the tokens are joined back with single spaces. Note that this
        tokenization splits on apostrophes ("it's" -> "it s"). Requires the
        NLTK ``wordnet`` corpus.

    Returns
    -------
    pandas.Series
        The cleaned tweets, aligned with the index of ``input_tweets``.
    """
    # (pattern, replacement) pairs; the order matters, e.g. usernames must be
    # removed before the remaining stand-alone "@" signs are spelled out.
    substitutions = [
        (r"http\S+", " "),          # URLs
        (r"http", " "),             # leftover bare "http" fragments
        (r"@\S+", " "),             # usernames
        (r"@", "at"),               # any remaining "@" becomes "at"
        (r"[^\w\s']", " "),         # non-alphanumerics except apostrophes/spaces
        (r'[^\x00-\x7F]+', " "),    # non-ASCII characters
        (r'\d+', ""),               # digits
    ]

    tweets = input_tweets.str.lower()
    for pattern, replacement in substitutions:
        tweets = tweets.replace(pattern, replacement, regex=True)

    if lemmatize:
        lemmatizer = WordNetLemmatizer()

        def _lemmatize_text(text):
            # Leave non-string entries (e.g. missing values) untouched
            if not isinstance(text, str):
                return text
            tokens = re.findall(r'\b\w+\b', text)
            return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

        tweets = tweets.apply(_lemmatize_text)

    return tweets

#### Clean tweets and remove stopwords
Apply the `clean_tweet` function to the original tweets, add the clean tweets to a new column and remove nltk stopwords.

In [5]:
# Place clean data in new column
train['CleanTweet'] = clean_tweet(train['OriginalTweet'])
test['CleanTweet'] = clean_tweet(test['OriginalTweet'])

# Load stopwords; download the corpus first if it is not installed yet
# (the import cell only downloads 'wordnet')
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`.

    Non-string values (e.g. missing tweets) are returned unchanged so that
    they can be filtered out explicitly afterwards instead of raising here.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(word for word in text.split() if word not in stop_words)


# Remove stopwords
train['CleanTweet'] = train['CleanTweet'].apply(remove_stopwords)
test['CleanTweet'] = test['CleanTweet'].apply(remove_stopwords)

#### Remove irrelevant columns and rows

In [6]:
# Columns that are no longer needed once the cleaned text exists
unused_columns = ['OriginalTweet', 'UserName', 'ScreenName', 'Location', 'TweetAt']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [7]:
# Report the dimensions of both sets after dropping columns
for label, frame in (("Train shape : ", train), ("Test shape : ", test)):
    print(label, frame.shape)

Train shape :  (41157, 2)
Test shape :  (3798, 2)


In [8]:
# Keep only the rows whose cleaned tweet is a non-empty string
def _has_text(value):
    """True when `value` is a string other than the empty string."""
    return isinstance(value, str) and value != ''


train = train[train['CleanTweet'].apply(_has_text)]
test = test[test['CleanTweet'].apply(_has_text)]
for label, frame in (("Train shape : ", train), ("Test shape : ", test)):
    print(label, frame.shape)
# Per the recorded shapes before/after this cell, 27 train rows and 2 test rows
# (non-string or empty) were removed

Train shape :  (41130, 2)
Test shape :  (3796, 2)


In [9]:
# Renumber the rows of both frames from 0 after filtering, discarding the old index
train, test = (frame.reset_index(drop=True) for frame in (train, test))

#### Split into Train, Validation, and Test (approx. 83% / 8.5% / 8.5%)

In [10]:
# Test set: the provided test file, used as-is
X_test = test["CleanTweet"]
y_test = test["Sentiment"]

# Split the provided training file into Train and Validation sets.
# - The validation set is sized to match the test set; with the recorded row
#   counts this equals the previously hard-coded train_size=37334.
# - random_state fixes the shuffle so the split is reproducible across runs.
# - Splitting directly from `train` keeps this cell idempotent when re-run.
X_train, X_val, y_train, y_val = train_test_split(
    train["CleanTweet"],
    train["Sentiment"],
    test_size=len(X_test),
    random_state=42,
)

In [11]:
# Features and labels of each split should have matching lengths
print(len(X_train))
print(len(y_train))

print(len(X_val))
print(len(y_val))

print(len(X_test))
print(len(y_test))  # was len(X_test) twice, so the test labels were never checked

37334
37334
3796
3796
3796
3796


### Save the data

In [12]:
# Make sure the output directory exists; to_csv does not create missing folders
os.makedirs("data", exist_ok=True)

# Save the training data
X_train.to_csv("data/X_train.csv")
y_train.to_csv("data/y_train.csv")

# Save the validation data
X_val.to_csv("data/X_val.csv")
y_val.to_csv("data/y_val.csv")

# Save the test data
X_test.to_csv("data/X_test.csv")
y_test.to_csv("data/y_test.csv")

### Load the data

In [13]:
def _read_saved_column(csv_path, column):
    """Read a saved split from `csv_path` and return `column` with a fresh 0-based index."""
    frame = pd.read_csv(csv_path, index_col=0)
    return frame.reset_index(drop=True)[column]


# Load the training data
X_train = _read_saved_column("data/X_train.csv", 'CleanTweet')
y_train = _read_saved_column("data/y_train.csv", 'Sentiment')

# Load the validation data
X_val = _read_saved_column("data/X_val.csv", 'CleanTweet')
y_val = _read_saved_column("data/y_val.csv", 'Sentiment')

# Load the test data
X_test = _read_saved_column("data/X_test.csv", 'CleanTweet')
y_test = _read_saved_column("data/y_test.csv", 'Sentiment')