In [1]:
# Install required packages
!pip install pandas numpy matplotlib seaborn plotly nltk scikit-learn wordcloud

# Download NLTK data
import nltk
nltk.download('stopwords')
nltk.download('punkt')

print("All packages installed successfully!")

Defaulting to user installation because normal site-packages is not writeable
Collecting plotly
  Downloading plotly-6.4.0-py3-none-any.whl.metadata (8.5 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting wordcloud
  Downloading wordcloud-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-2.11.0-py3-none-any.whl.metadata (11 kB)
Collecting click (from nltk)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m866.1 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading plotly-6.4.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m63.8

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/u.sk336075/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/u.sk336075/nltk_data...


All packages installed successfully!


[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
# Checking for the new files
import os
import time

# pandas is imported here because no earlier cell of this notebook does so;
# without it pd.read_csv below raises NameError on a freshly started kernel.
import pandas as pd

print("Waiting for new files to be uploaded...")
print("Expected files: train.csv, test.csv, sample_submission.csv")

# Check for files in data folder
data_files = os.listdir('../data')
print("Current files in data folder:", data_files)

# Wait a moment for upload to complete
time.sleep(2)

# Check again
data_files = os.listdir('../data')
print("Files after waiting:", data_files)

if 'train.csv' in data_files:
    print("\nLoading complete training dataset...")
    train_df = pd.read_csv('../data/train.csv')
    print("Complete training dataset loaded successfully!")
    print("Dataset shape:", train_df.shape)
    print("Columns:", train_df.columns.tolist())

    # Quick preview
    print("\nFirst 2 rows with text:")
    print(train_df[['text', 'target']].head(2))
else:
    # train_df is NOT defined on this path, so every later cell that reads it
    # will fail until the file is uploaded and this cell is re-run.
    print("\n train.csv not found yet")
    print("Please make sure train.csv is uploaded to ../data/ folder")

Waiting for new files to be uploaded...
Expected files: train.csv, test.csv, sample_submission.csv
Current files in data folder: ['test.csv', 'train.csv', 'sample_submission.csv', '.ipynb_checkpoints']
Files after waiting: ['test.csv', 'train.csv', 'sample_submission.csv', '.ipynb_checkpoints']

Loading complete training dataset...
Complete training dataset loaded successfully!
Dataset shape: (7613, 5)
Columns: ['id', 'keyword', 'location', 'text', 'target']

First 2 rows with text:
                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1


In [12]:
# Basic dataset inspection
print("COMPLETE DATASET ANALYSIS")
print("=" * 40)

# Unpack the shape once; the row count doubles as the denominator below
n_rows, n_cols = train_df.shape
print("Dataset dimensions:", train_df.shape)
print("Number of samples:", n_rows)
print("Number of features:", n_cols)

print("\nColumn details:")
print(train_df.dtypes)

# Per-column null counts and the share of rows they represent
print("\nMissing values analysis:")
missing_data = train_df.isna().sum()
missing_percent = missing_data / n_rows * 100
missing_summary = pd.DataFrame(
    {'Missing Count': missing_data, 'Missing Percentage': missing_percent}
)
print(missing_summary)

# Class balance of the label column, as absolute counts and as percentages
print("\nTarget distribution:")
target_column = train_df['target']
target_counts = target_column.value_counts()
target_percent = target_column.value_counts(normalize=True).mul(100)
target_summary = pd.DataFrame(
    {'Count': target_counts, 'Percentage': target_percent}
)
print(target_summary)

# First three tweets of each class, numbered from 1
print("\nSample disaster tweets (target=1):")
disaster_tweets = train_df.loc[target_column == 1, 'text'].head(3)
for i, tweet in enumerate(disaster_tweets, start=1):
    print(f"{i}. {tweet}")

print("\nSample non-disaster tweets (target=0):")
non_disaster_tweets = train_df.loc[target_column == 0, 'text'].head(3)
for i, tweet in enumerate(non_disaster_tweets, start=1):
    print(f"{i}. {tweet}")

COMPLETE DATASET ANALYSIS
Dataset dimensions: (7613, 5)
Number of samples: 7613
Number of features: 5

Column details:
id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

Missing values analysis:
          Missing Count  Missing Percentage
id                    0            0.000000
keyword              61            0.801261
location           2533           33.272035
text                  0            0.000000
target                0            0.000000

Target distribution:
        Count  Percentage
target                   
0        4342   57.034021
1        3271   42.965979

Sample disaster tweets (target=1):
1. Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
2. Forest fire near La Ronge Sask. Canada
3. All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected

Sample non-disaster tweets (target=0):
1. What's up man?
2. I love 

In [14]:
# Fixing NLTK download issues and creating robust preprocessing
import nltk

print("Downloading required NLTK data")
try:
    # nltk.download() returns False (it does not raise) when a download fails,
    # so the return value is what decides which message is printed.
    punkt_tab_ready = nltk.download('punkt_tab')
except Exception as exc:
    # Catch Exception, not a bare except, so Ctrl-C / kernel interrupts still work.
    print("punkt_tab download raised:", exc)
    punkt_tab_ready = False

if punkt_tab_ready:
    print("punkt_tab downloaded successfully")
else:
    print("Using alternative approach")
# Creating a simpler preprocessing function that doesn't rely on advanced tokenization
def clean_tweet_text_simple(text):
    """
    Clean tweet text using simple string operations.

    Steps, in order: lowercase the text, strip ``http...`` URLs, strip
    ``@mentions``, drop ``#`` while keeping the tag word, delete every
    character that is not an ASCII letter or whitespace, collapse runs of
    whitespace, and remove a small fixed set of English stopwords.

    Parameters
    ----------
    text : Any
        Raw tweet text. Values that are not ``str`` (for example NaN or None)
        are returned unchanged.

    Returns
    -------
    Any
        The cleaned, single-space-joined string for ``str`` input (possibly
        the empty string); otherwise ``text`` exactly as received.
    """
    # Imported locally because no cell of the notebook imports `re`; without
    # this the function raises NameError on a freshly started kernel.
    import re

    # Leave non-string values untouched
    if not isinstance(text, str):
        return text

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove user mentions
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags but keep the text
    text = re.sub(r'#', '', text)

    # Remove special characters and digits, keep only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Simple stopwords removal; split() also collapses extra whitespace
    stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'was', 'were',
                  'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'as', 'it'}
    filtered_words = [word for word in text.split() if word not in stop_words]

    return ' '.join(filtered_words)

print("Simplified preprocessing function created successfully")

# Smoke-test the cleaner on a hand-written example containing a hashtag and a URL
test_tweet = "Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all http://example.com"
test_tweet_cleaned = clean_tweet_text_simple(test_tweet)
print("\nTesting simplified preprocessing:")
print("Original tweet:", test_tweet)
print("After cleaning:", test_tweet_cleaned)

# Then show before/after for the first two tweets of the training data
print("\nTesting on actual dataset samples:")
sample_tweets = train_df['text'].head(2)
for i, tweet in enumerate(sample_tweets):
    cleaned_tweet = clean_tweet_text_simple(tweet)
    print(f"Sample {i+1}:")
    print("  Original:", tweet)
    print("  Cleaned:", cleaned_tweet)
    print()

Downloading required NLTK data
punkt_tab downloaded successfully
Simplified preprocessing function created successfully

Testing simplified preprocessing:
Original tweet: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all http://example.com
After cleaning: our deeds reason this earthquake may allah forgive us all

Testing on actual dataset samples:
Sample 1:
  Original: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
  Cleaned: our deeds reason this earthquake may allah forgive us all

Sample 2:
  Original: Forest fire near La Ronge Sask. Canada
  Cleaned: forest fire near la ronge sask canada



[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/u.sk336075/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [15]:
# Applying preprocessing to the entire dataset
print("Starting preprocessing of entire dataset")

# Work on a copy so train_df keeps the raw data and this cell can be re-run
processed_df = train_df.copy()

print("Applying text cleaning to all tweets...")
processed_df['cleaned_text'] = processed_df['text'].apply(clean_tweet_text_simple)

# Missing keyword/location values are replaced by the literal string 'unknown'
print("Handling missing values...")
processed_df['keyword'] = processed_df['keyword'].fillna('unknown')
processed_df['location'] = processed_df['location'].fillna('unknown')

# text_length = number of whitespace-separated tokens in the cleaned text,
# char_length = number of characters in the cleaned text
print("Creating text length features...")
processed_df['text_length'] = processed_df['cleaned_text'].apply(lambda x: len(str(x).split()))
processed_df['char_length'] = processed_df['cleaned_text'].apply(lambda x: len(str(x)))

print("Preprocessing completed successfully")
print("New dataset shape:", processed_df.shape)
print("New columns:", processed_df.columns.tolist())

# Display preprocessing results
print("\nPreprocessing results summary:")
print("Original text sample:")
print(train_df['text'].iloc[0])
print("Cleaned text sample:")
print(processed_df['cleaned_text'].iloc[0])
print("Text length:", processed_df['text_length'].iloc[0])
print("Character length:", processed_df['char_length'].iloc[0])

print("\nMissing values after preprocessing:")
print(processed_df.isnull().sum())

print("\nDataset info after preprocessing:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
processed_df.info()

Starting preprocessing of entire dataset
Applying text cleaning to all tweets...
Handling missing values...
Creating text length features...
Preprocessing completed successfully
New dataset shape: (7613, 8)
New columns: ['id', 'keyword', 'location', 'text', 'target', 'cleaned_text', 'text_length', 'char_length']

Preprocessing results summary:
Original text sample:
Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Cleaned text sample:
our deeds reason this earthquake may allah forgive us all
Text length: 10
Character length: 57

Missing values after preprocessing:
id              0
keyword         0
location        0
text            0
target          0
cleaned_text    0
text_length     0
char_length     0
dtype: int64

Dataset info after preprocessing:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7613 non

In [16]:
# Splitting data into training and validation sets
from sklearn.model_selection import train_test_split

print("Splitting data into training and validation sets")

# Model input is the cleaned tweet text; the label is the target column
X = processed_df['cleaned_text']
y = processed_df['target']

# 80% training / 20% validation, stratified on y so that both parts keep the
# same target distribution; the fixed random_state makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Data splitting completed successfully")
print(f"Training set size: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set size: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")

# Class counts and percentages inside the training part
print("\nTraining set target distribution:")
train_target_counts = y_train.value_counts()
train_target_percent = y_train.value_counts(normalize=True).mul(100)
train_target_summary = pd.DataFrame(
    {'Count': train_target_counts, 'Percentage': train_target_percent}
)
print(train_target_summary)

# Same report for the validation part
print("\nValidation set target distribution:")
val_target_counts = y_val.value_counts()
val_target_percent = y_val.value_counts(normalize=True).mul(100)
val_target_summary = pd.DataFrame(
    {'Count': val_target_counts, 'Percentage': val_target_percent}
)
print(val_target_summary)

# Saving the processed data
# NOTE(review): cleaned_text values that are empty strings are presumably read
# back as NaN by pd.read_csv defaults - confirm downstream notebooks handle it.
print("\nSaving processed datasets...")
processed_df.to_csv('../data/processed_train.csv', index=False)

# Recover the full rows (all columns) of each split through the index labels
train_split = processed_df.loc[X_train.index]
val_split = processed_df.loc[X_val.index]

train_split.to_csv('../data/train_split.csv', index=False)
val_split.to_csv('../data/val_split.csv', index=False)

print("Processed data saved successfully")
created_files_report = [
    "Files created:",
    "- ../data/processed_train.csv (full processed dataset)",
    "- ../data/train_split.csv (training split)",
    "- ../data/val_split.csv (validation split)",
]
print("\n".join(created_files_report))

Splitting data into training and validation sets
Data splitting completed successfully
Training set size: 6090 samples (80.0%)
Validation set size: 1523 samples (20.0%)

Training set target distribution:
        Count  Percentage
target                   
0        3473   57.027915
1        2617   42.972085

Validation set target distribution:
        Count  Percentage
target                   
0         869   57.058437
1         654   42.941563

Saving processed datasets...
Processed data saved successfully
Files created:
- ../data/processed_train.csv (full processed dataset)
- ../data/train_split.csv (training split)
- ../data/val_split.csv (validation split)


In [17]:
# Final summary for Notebook 1: Data Preprocessing
print("NOTEBOOK 1: DATA PREPROCESSING - COMPLETED")
print("=" * 50)

# Each list is one section of the report; joining with newlines prints exactly
# the same lines as one print() call per entry would.
processing_summary = [
    "PROCESSING SUMMARY:",
    f"✓ Original dataset: {len(train_df)} samples, {train_df.shape[1]} columns",
    f"✓ Processed dataset: {len(processed_df)} samples, {processed_df.shape[1]} columns",
    f"✓ Missing values handled: {processed_df.isnull().sum().sum()} remaining",
    f"✓ Text cleaning applied to all {len(processed_df)} tweets",
    f"✓ New features created: text_length, char_length",
    f"✓ Data split: {len(X_train)} training, {len(X_val)} validation samples",
    f"✓ Class balance maintained in both splits",
]
print("\n".join(processing_summary))

files_generated = [
    "\nFILES GENERATED:",
    "✓ ../data/processed_train.csv - Full processed dataset",
    "✓ ../data/train_split.csv - Training split (80%)",
    "✓ ../data/val_split.csv - Validation split (20%)",
]
print("\n".join(files_generated))

next_steps = [
    "\nNEXT STEPS:",
    "→ Proceed to Notebook 2: EDA and Visualization",
    "→ Analyze text length distributions",
    "→ Create word frequency visualizations",
    "→ Generate class distribution charts",
]
print("\n".join(next_steps))

print("\nDATA PREPROCESSING PHASE COMPLETED SUCCESSFULLY")

NOTEBOOK 1: DATA PREPROCESSING - COMPLETED
PROCESSING SUMMARY:
✓ Original dataset: 7613 samples, 5 columns
✓ Processed dataset: 7613 samples, 8 columns
✓ Missing values handled: 0 remaining
✓ Text cleaning applied to all 7613 tweets
✓ New features created: text_length, char_length
✓ Data split: 6090 training, 1523 validation samples
✓ Class balance maintained in both splits

FILES GENERATED:
✓ ../data/processed_train.csv - Full processed dataset
✓ ../data/train_split.csv - Training split (80%)
✓ ../data/val_split.csv - Validation split (20%)

NEXT STEPS:
→ Proceed to Notebook 2: EDA and Visualization
→ Analyze text length distributions
→ Create word frequency visualizations
→ Generate class distribution charts

DATA PREPROCESSING PHASE COMPLETED SUCCESSFULLY
