In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf  # Import TensorFlow library for machine learning tasks.
import tensorflow_hub as hub  # Import TensorFlow Hub for reusable machine learning modules.
import tensorflow_text as text  # Import TensorFlow Text for text processing operations.

In [None]:
import pandas as pd  # Import the Pandas library and alias it as 'pd' for ease of use.

# Load sample_submission.csv into `df`.
df = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
import pandas as pd  # Import the Pandas library and alias it as 'pd' for ease of use.

# Load test.csv into `test_df`.
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Preview the first five rows (head() defaults to n=5).
test_df.head()

In [None]:
import pandas as pd  # Import the Pandas library and alias it as 'pd' for ease of use.

# Load train.csv into `train_df`.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

# Preview the first ten rows.
train_df.head(n=10)

In [None]:
# Print a concise summary of train_df: its class, the index range (row count),
# each column's name, non-null count and dtype, and the frame's memory usage.
train_df.info()

In [None]:
# Collect the distinct values of the 'keyword' column.
unique_keywords = train_df['keyword'].unique()

# Print them one per line.
for kw in unique_keywords:
    print(kw)

In [None]:
# Count how many rows carry each distinct 'keyword' value. The result is a
# Series indexed by keyword whose values are the occurrence counts.
train_df['keyword'].value_counts()

In [None]:
# Row count before de-duplication.
print(train_df.shape[0])

# Keep a single row per distinct 'text' value; when a text repeats, the last
# occurrence is the one that survives.
train_df = train_df.drop_duplicates(subset='text', keep='last')

# Row count after de-duplication.
print(train_df.shape[0])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Very tall canvas: one horizontal bar group per keyword, split by target label.
plt.figure(figsize=(15, 100))
sns.countplot(y='keyword', hue='target', data=train_df)

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

_STOP_WORDS = None  # lazily built cache of NLTK's English stop words


def _get_stop_words():
    """Return NLTK's English stop-word set, loading it on first use only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalise one raw text string into space-joined, stop-word-free tokens.

    Steps, in order: strip URLs, strip HTML tags, drop every character that is
    neither a word character nor whitespace, lowercase, tokenize with NLTK's
    ``word_tokenize``, remove English stop words, and join what is left with
    single spaces.

    Args:
        text: The raw text. Any non-string value (for example the NaN pandas
            uses for a missing cell) is treated as empty text.

    Returns:
        str: The cleaned text; ``''`` when nothing survives or the input was
        not a string.
    """
    # re.sub raises TypeError on non-strings, so bail out before touching it.
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove HTML tags if dealing with web data
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords. The set is cached so it is not rebuilt for every row
    # when this function is applied to a whole column.
    stop_words = _get_stop_words()
    kept_tokens = [word for word in tokens if word not in stop_words]

    # Join tokens back into a single string. The result is not stored under
    # the name `clean_text`, which would shadow this function inside its body.
    return ' '.join(kept_tokens)

# Run clean_text over every entry of train_df['text'].
cleaned_text_column = train_df['text'].apply(clean_text)

# Overwrite the raw 'text' column with the cleaned strings; the original text
# is no longer available in train_df after this line.
train_df['text'] = cleaned_text_column

# Display the cleaned column.
train_df['text']

In [None]:
# Preview the first 15 rows of train_df after cleaning.
train_df.head(n=15)

In [None]:
# Class balance: number of rows per 'target' label.
train_df['target'].value_counts()

In [None]:
import pandas as pd
from sklearn.utils import resample

# Split the rows by label. The code treats label 0 as the majority class and
# label 1 as the minority class.
majority_class = train_df.loc[train_df['target'] == 0]
minority_class = train_df.loc[train_df['target'] == 1]

# Randomly drop label-0 rows (sampling without replacement, fixed seed) until
# there are exactly as many as label-1 rows.
downsampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack the two equally sized groups into one frame.
# NOTE(review): balanced_df is not referenced by any later cell visible here;
# the split and training cells read train_df instead - confirm this is intended.
balanced_df = pd.concat([downsampled_majority, minority_class])

# Display the balanced frame.
balanced_df

In [None]:
from sklearn.model_selection import train_test_split  # Importing train_test_split function from scikit-learn library for splitting dataset.

# Split the cleaned texts and their labels into train and test parts:
# - inputs and labels are taken from train_df as plain Python lists;
# - test_size=0.01 holds out 1% of the rows;
# - stratify keeps the label proportions the same in both parts;
# - random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'].tolist(),
    train_df['target'].tolist(),
    test_size=0.01,
    stratify=train_df['target'].tolist(),
    random_state=0,
)

In [None]:
# Peek at the first 15 entries of X_train.
print(X_train[:15])

In [None]:
import tensorflow as tf  # Import TensorFlow library for machine learning tasks.
import tensorflow_hub as hub  # Import TensorFlow Hub for reusable machine learning modules.
import tensorflow_text as text  # Import TensorFlow Text for text processing operations.

In [None]:
import numpy as np

# TF-Hub handles for the uncased English BERT preprocessing model and encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both as Keras layers; get_sentence_embeddings reads these two globals.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

def get_sentence_embeddings(df, text_column, batch_size=32):
    """Embed every entry of ``df[text_column]`` with the module-level BERT layers.

    The texts are passed through ``bert_preprocess`` and then ``bert_encoder``
    in consecutive slices of at most ``batch_size`` items, and the encoder's
    ``'pooled_output'`` for each slice is collected.

    Args:
        df: Object indexable by ``text_column`` whose result offers
            ``.tolist()`` (a pandas DataFrame in this notebook).
        text_column: Name of the column that holds the texts.
        batch_size: Maximum number of texts sent to the encoder per call.

    Returns:
        The per-batch pooled outputs joined along axis 0 with
        ``np.concatenate``, one row per input text.
    """
    sentences = df[text_column].tolist()
    total = len(sentences)

    # Ceiling division: a final partial batch still counts as a batch.
    n_batches = -(-total // batch_size)

    pooled_batches = []
    for batch_no in range(n_batches):
        lo = batch_no * batch_size
        # Slicing clamps at the end of the list, so the last batch may be short.
        chunk = sentences[lo:lo + batch_size]

        # Preprocess the raw strings, encode them, keep only the pooled output.
        encoded = bert_encoder(bert_preprocess(chunk))
        pooled_batches.append(encoded['pooled_output'])

    # Stitch the batches back together in their original order.
    return np.concatenate(pooled_batches, axis=0)

# Embed the 'text' column of train_df and print the resulting array.
embeddings = get_sentence_embeddings(train_df, text_column='text')
print(embeddings)

In [None]:
import tensorflow as tf

# Small feed-forward binary classifier that takes the precomputed sentence
# embeddings as input.
model = tf.keras.Sequential()

# The first layer declares the input width explicitly: one feature per
# embedding dimension (embeddings.shape[1]).
model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(embeddings.shape[1],)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation='relu'))

# Single sigmoid unit: the output is a value between 0 and 1 per row.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Requires `embeddings`, `train_df` and `model` from the cells above. The
# X_train / y_train names are (re)assigned here to embedding-based splits.
from sklearn.model_selection import train_test_split

# Hold out 20% of the embeddings for validation. Stratifying on the label keeps
# the class ratio the same in both parts, consistent with the stratified text
# split made earlier in the notebook.
X_train, X_val, y_train, y_val = train_test_split(
    embeddings,
    train_df['target'],
    test_size=0.2,
    stratify=train_df['target'],
    random_state=42,
)

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# epochs=100 is only an upper bound: stop once the validation loss has not
# improved for 5 consecutive epochs and roll back to the best weights seen, so
# the model used for prediction is not the most over-fitted one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Fit the model using the training and validation data
model.fit(X_train, y_train,
          epochs=100,
          batch_size=32,
          validation_data=(X_val, y_val),
          callbacks=[early_stopping])

In [None]:
# Show the dimensions of the training embedding array.
print(embeddings.shape)

In [None]:
# Clean the test texts with the same clean_text function used for training so
# both sets go through identical preprocessing.
cleaned_text_column_test = test_df['text'].apply(clean_text)

# Overwrite the raw 'text' column with the cleaned strings.
test_df['text'] = cleaned_text_column_test

# Embed the cleaned test texts. (A bare `test_df['text']` expression used to
# sit before this line; not being the last statement of the cell, it displayed
# nothing and has been dropped.)
test_embeddings = get_sentence_embeddings(test_df, 'text')

In [None]:
# Model output for every test row (the network ends in a single sigmoid unit).
probs = model.predict(test_embeddings)

# Rows whose score exceeds this cut-off are labelled 1, all others 0.
threshold = 0.4

# Flatten before thresholding so preds is a 1-D array with one label per row;
# the previous `probs[:,]` was a no-op index that left a trailing axis in place.
preds = np.where(probs.ravel() > threshold, 1, 0)

In [None]:
# Load sample_submission.csv as the frame the predictions will be written into.
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Compare the number of predictions with the number of submission rows.
print("Length of preds array:", len(preds))
print("Length of submission DataFrame index:", submission.shape[0])

In [None]:
# Check data alignment between submission DataFrame index and preds array
submission_index = submission.index

# Convert submission index to a list for comparison
submission_index_list = submission_index.tolist()

# Membership tests against a list scan it from the start each time, which made
# the check below quadratic in the number of rows; a set answers the same
# question in constant time per lookup.
submission_index_values = set(submission_index_list)

# True when every position 0 .. len(preds)-1 occurs as a label in the
# submission index.
alignment_check = all(pos in submission_index_values for pos in range(len(preds)))

# Print alignment check result
if alignment_check:
    print("Data alignment check: Submission DataFrame index aligns with preds array.")
else:
    print("Data alignment check: Submission DataFrame index does not align with preds array.")

In [None]:
# Summarise the submission frame. DataFrame.info() writes its report itself and
# returns None, so it is called directly; wrapping it in print() added a stray
# "None" line to the output.
print("Submission DataFrame Info:")
submission.info()

# Display the first few rows of the submission DataFrame
print("\nFirst few rows of the submission DataFrame:")
print(submission.head())

In [None]:
# Debugging aid: compare the prediction count with the submission index.
print("Length of preds array:", len(preds))
print("Length of submission DataFrame index:", len(submission.index))

# Positions 0 .. len(preds)-1 that do not occur as a label in the submission index.
missing_indices = [
    pos for pos in range(len(preds))
    if pos not in submission.index
]
if missing_indices:
    print("Missing indices in submission DataFrame:", missing_indices)

# Submission index labels that lie at or beyond len(preds).
extra_indices = [
    label for label in submission.index
    if label >= len(preds)
]
if extra_indices:
    print("Extra indices in submission DataFrame:", extra_indices)

# Show five randomly chosen rows for a manual look.
print("\nSample rows of the submission DataFrame:")
print(submission.sample(5))

In [None]:
# Keep only as many submission rows as there are predictions. The .copy()
# turns the slice into an independent frame, so assigning a column to it later
# does not raise pandas' SettingWithCopyWarning.
submission = submission.iloc[:len(preds)].copy()

# Verify lengths after removing extra indices
print("Length of preds array:", len(preds))
print("Length of submission DataFrame after removing extra indices:", len(submission.index))

In [None]:
# Write the predicted labels into the 'target' column. Flattening first means
# the column receives one scalar per row whether preds arrives as (N,) or (N, 1).
submission["target"] = np.asarray(preds).ravel()

In [None]:
# Write the submission file with its column header row. header=False used to
# drop that row, leaving a file whose first line was data rather than the
# column names present in sample_submission.csv.
submission.to_csv('submission.csv', index=False)

In [None]:
# Final row count of the submission frame.
print(len(submission))