In [None]:
import csv
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import nltk
# Download the 'punkt' tokenizer data (needed by word_tokenize below).
nltk.download('punkt')
# Download the stop-word lists read via stopwords.words('english').
nltk.download('stopwords')
# NOTE(review): redundant re-import; nltk is already imported a few lines above.
import nltk
# Download the WordNet data (needed by WordNetLemmatizer).
nltk.download('wordnet')
# Single module-level lemmatizer instance, reused by preprocess_text for every row.
lemmatizer = WordNetLemmatizer()

# preprocess text
# Lazily-filled cache of the English stop words. Kept as a frozenset so that
# membership tests are O(1) and the word list is built once rather than once
# per processed document.
_STOP_WORDS = None


def _get_stop_words():
    """Return the English stop words as a frozenset, loading them on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise a piece of text into a list of lemmatized tokens.

    Steps, in order:
      1. convert to ``str``, strip surrounding whitespace and lower-case;
      2. replace newline characters with spaces;
      3. tokenize with ``word_tokenize``;
      4. drop every non-alphanumeric character from each token;
      5. discard tokens of two characters or fewer (this also removes tokens
         that became empty in step 4) and English stop words;
      6. lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: Any object; it is converted with ``str()`` before processing.

    Returns:
        list[str]: The cleaned, lemmatized tokens in their original order.
    """
    normalised = str(text).strip().lower()
    normalised = normalised.replace("\n", " ")

    # Strip punctuation / special characters from each individual token.
    cleaned_tokens = [
        "".join(char for char in token if char.isalnum())
        for token in word_tokenize(normalised)
    ]

    stop_words = _get_stop_words()
    # Keep only tokens longer than two characters that are not stop words.
    kept_tokens = [
        token for token in cleaned_tokens
        if len(token) > 2 and token not in stop_words
    ]

    return [lemmatizer.lemmatize(token) for token in kept_tokens]

# Function to read CSV file and return the specified columns
def read_csv_file(filename, columns, headers, encoding='utf-8', text_column=5):
    """Read a CSV file and return selected columns with the text column preprocessed.

    The first row of the file is treated as a header and skipped. For every
    remaining non-blank row, the cells listed in ``columns`` are extracted (in
    that order) and every position that refers to ``text_column`` is replaced
    by the output of ``preprocess_text`` for that cell.

    Args:
        filename: Path of the CSV file to read.
        columns: Sequence of column indices to extract from each row.
        headers: Header row placed as the first element of the result (the
            same list object is stored, not a copy).
        encoding: Text encoding used to open the file.
        text_column: Index, in the source row, of the column holding the raw
            text. Defaults to 5, the index that was previously hard-coded.
            If it does not appear in ``columns`` no cell is replaced.

    Returns:
        list: ``[headers, row_1, row_2, ...]``; a file with no data rows
        (including a completely empty file) yields ``[headers]``.

    Raises:
        IndexError: If a non-blank row has fewer cells than required by
            ``columns`` or ``text_column``.
    """
    data = [headers]
    # Positions inside each selected row that hold the text column.
    text_positions = [pos for pos, col in enumerate(columns) if col == text_column]

    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields containing embedded newlines.
    with open(filename, 'r', encoding=encoding, newline='') as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)  # Skip the header row; tolerate an empty file
        for row in csv_reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            selected_row = [row[i] for i in columns]
            if text_positions:
                preprocessed_text = preprocess_text(row[text_column])
                for pos in text_positions:
                    selected_row[pos] = preprocessed_text
            data.append(selected_row)
    return data

# Names for the output columns and the source-column indices they come from
custom_headers = ["target", "text"]
columns_to_extract = [0, 5]

# Load the dataset; the first element of `data` is the header row
data = read_csv_file(
    '/content/training.1600000.processed.noemoticon.csv',
    columns_to_extract,
    custom_headers,
    encoding='latin-1',
)

# Everything after the header row is a [target, text] pair
records = data[1:]
features = [text for _, text in records]
labels = [int(target) for target, _ in records]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


print("Training set size:", len(x_train))
print("Testing set size:", len(x_test))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training set size: 1279999
Testing set size: 320000


In [None]:
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Show the first five (text, label) pairs of each split
for split_title, split_texts, split_labels in (
    ("Training Data Samples:", x_train, y_train),
    ("Testing Data Samples:", x_test, y_test),
):
    print(split_title)
    for position in range(5):
        print("Sample:", position + 1)
        print("Text:", split_texts[position])
        print("Label:", split_labels[position])
        print()

# Learn the label -> integer index mapping from the labels of both splits
y_combined = y_train + y_test
encoder = LabelEncoder().fit(y_combined)

# Map each split's labels to their integer indices
y_train_encoded, y_test_encoded = (
    encoder.transform(split_labels) for split_labels in (y_train, y_test)
)

# One-hot encode the integer indices, one column per distinct label
num_classes = len(encoder.classes_)
y_train_onehot, y_test_onehot = (
    to_categorical(encoded, num_classes)
    for encoded in (y_train_encoded, y_test_encoded)
)

Training Data Samples:
Sample: 1
Text: ['otavolimed', 'second', 'get', 'back', 'haha']
Label: 4

Sample: 2
Text: ['race', 'life', 'awesome']
Label: 4

Sample: 3
Text: ['good', 'clothes', 'otherwise', 'could', 'even', 'awkward', 'already', 'lol']
Label: 4

Sample: 4
Text: ['late', 'work', 'week', 'even', 'take', 'metro', 'guess', 'bus', 'affected', 'crash']
Label: 0

Sample: 5
Text: ['selestina118', 'incredible', 'people', 'need', 'know', 'worry']
Label: 4

Testing Data Samples:
Sample: 1
Text: ['nkluvr4eva', 'poor', 'little', 'dumpling', 'holmdel', 'vids', 'really', 'trying', 'hope', 'dont', 'try', 'hard', 'tonight']
Label: 0

Sample: 2
Text: ['bed', 'got', 'wake', 'hella', 'early', 'tomorrow', 'morning']
Label: 0

Sample: 3
Text: ['havent', 'able', 'listen', 'yet', 'speaker', 'busted']
Label: 0

Sample: 4
Text: ['remembers', 'solving', 'relatively', 'big', 'equation', 'two', 'unknown', 'total', 'pain', 'butt']
Label: 0

Sample: 5
Text: ['ate', 'much', 'feel', 'sick']
Label: 0



In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical


# Vocabulary cap, padded sequence length and embedding width
max_words, max_len, embedding_dim = 10000, 100, 100

# Build the vocabulary from the training split only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

# Convert both splits to integer sequences using that vocabulary
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(split_texts) for split_texts in (x_train, x_test)
)

# Pad every sequence to max_len
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (x_train_seq, x_test_seq)
)


In [None]:
# Assemble the network layer by layer: embedding -> 1D convolution ->
# global max pooling -> dense hidden layer with dropout -> softmax output
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Configure the optimizer, loss and reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for two epochs, reporting metrics on the test split after each epoch
history = model.fit(
    x_train_pad,
    y_train_onehot,
    epochs=2,
    batch_size=132,
    validation_data=(x_test_pad, y_test_onehot),
)

# Final loss / accuracy on the test split
loss, accuracy = model.evaluate(x_test_pad, y_test_onehot)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Define the model
# Convolution kernel widths to compare
filter_sizes = [3, 5, 7]

# Per-filter-size outcome, so results remain available after the loop instead
# of existing only in the printed output:
#   {filter_size: {"loss": ..., "accuracy": ..., "history": {...}}}
filter_size_results = {}

for filter_size in filter_sizes:
    # Same architecture as the baseline model; only the kernel width varies.
    # The embedding width comes from the shared embedding_dim constant rather
    # than a repeated literal, so both cells stay in sync.
    model = Sequential([
        Embedding(max_words, embedding_dim, input_length=max_len),
        Conv1D(128, filter_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(x_train_pad, y_train_onehot, epochs=2, batch_size=132, validation_data=(x_test_pad, y_test_onehot))

    # Evaluate the model
    loss, accuracy = model.evaluate(x_test_pad, y_test_onehot)
    print(f"Test Loss (Filter Size {filter_size}):", loss)
    print(f"Test Accuracy (Filter Size {filter_size}):", accuracy)

    # Keep only the metrics and the per-epoch history dict (not the model
    # itself) to avoid holding several trained networks in memory.
    filter_size_results[filter_size] = {
        "loss": loss,
        "accuracy": accuracy,
        "history": history.history,
    }


Epoch 1/2
Epoch 2/2
Test Loss (Filter Size 3): 0.4531400799751282
Test Accuracy (Filter Size 3): 0.784375011920929
Epoch 1/2
Epoch 2/2
1280/9697 [==>...........................] - ETA: 16:48 - loss: 0.4422 - accuracy: 0.7925