# Tweet Sentiment Extraction - Baseline Model

This notebook implements a baseline approach for the tweet sentiment extraction competition.

## Strategy
1. Simple rule-based approach based on sentiment
2. For positive/negative: return short tweets unchanged; otherwise extract the first sentence, falling back to the first five words
3. For neutral: use the entire text

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Read the competition CSVs into DataFrames
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')
sample_submission = pd.read_csv('/home/data/sample_submission.csv')

# Report the dimensions of each frame, one per line
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    f"Sample submission shape: {sample_submission.shape}",
    sep="\n",
)

# Preview the first training rows and the label balance
print("\nTrain data info:", train_df.head(), sep="\n")
print("\nSentiment distribution:", train_df['sentiment'].value_counts(), sep="\n")

# Per-column count of null entries in the training frame
print("\nMissing values in train:", train_df.isnull().sum(), sep="\n")

In [None]:
# Define Jaccard score function for evaluation
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    in either string. Returns 0.0 when neither string contains a word.
    """
    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())
    shared = len(tokens_a & tokens_b)
    union_size = len(tokens_a) + len(tokens_b) - shared
    if union_size > 0:
        return float(shared) / union_size
    # Both inputs were empty or whitespace-only
    return 0.0

# Smoke-test jaccard on an identical, a partially overlapping and a disjoint pair
print("Testing Jaccard function:")
print(
    f"jaccard('very good', 'very good'): {jaccard('very good', 'very good')}",
    f"jaccard('very good', 'good'): {jaccard('very good', 'good')}",
    f"jaccard('very good', 'bad'): {jaccard('very good', 'bad')}",
    sep="\n",
)

In [None]:
# Simple baseline: 
# - For neutral sentiment: use the entire text
# - For positive/negative: extract the most relevant part

def clean_text(text):
    """Normalize a raw text cell to a stripped string.

    Args:
        text: Any scalar cell value; missing values (as judged by
            ``pd.isna``) are accepted.

    Returns:
        ``""`` for missing values; otherwise ``str(text)`` with leading and
        trailing whitespace removed and, when the result both starts and ends
        with a double quote, that one surrounding pair removed.
    """
    if pd.isna(text):
        return ""
    text = str(text).strip()
    # Require two characters: a lone '"' is both the first and the last
    # character, and unwrapping it would silently erase the whole value.
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    return text

# Run clean_text over every free-text column of both frames
for frame, columns in ((train_df, ('text', 'selected_text')), (test_df, ('text',))):
    for column in columns:
        frame[column] = frame[column].apply(clean_text)

print("Sample cleaned data:")
print(train_df[['text', 'selected_text', 'sentiment']].head())

In [None]:
# Inspect how selected_text relates to the full tweet within each sentiment class
print("Analyzing selected_text patterns by sentiment:")

for sentiment in ['positive', 'negative', 'neutral']:
    print(f"\n=== {sentiment.upper()} ===")
    subset = train_df[train_df['sentiment'] == sentiment]

    # Print up to three example rows for this sentiment
    print("Examples:")
    for _, row in subset.head(3).iterrows():
        print(f"  Text: '{row['text']}'")
        print(f"  Selected: '{row['selected_text']}'")
        print()

    # Character-length averages and the share of rows whose selection is the whole tweet
    full_match = subset['text'] == subset['selected_text']
    print(f"Avg text length: {subset['text'].str.len().mean():.1f}")
    print(f"Avg selected length: {subset['selected_text'].str.len().mean():.1f}")
    print(f"Selected is full text: {full_match.mean():.1%}")

In [None]:
# Create a simple baseline model
# For neutral: use full text
# For positive/negative: use some simple heuristics

def baseline_predict(text, sentiment):
    """Return the heuristic selected-text guess for one tweet.

    Missing text yields an empty string. Neutral tweets, tweets of at most
    three whitespace-separated words, and tweets shorter than 20 characters
    are returned whole (stripped). Otherwise the text before the first run of
    '.', '!' or '?' is returned when it is at least 10 characters long after
    stripping; failing that, the first five words joined by single spaces.
    """
    if pd.isna(text):
        return ""

    stripped = str(text).strip()
    words = stripped.split()

    # Whole tweet for neutral sentiment and for tweets too short to trim
    if sentiment == 'neutral' or len(words) <= 3 or len(stripped) < 20:
        return stripped

    # Leading segment up to the first sentence-ending punctuation run
    lead = re.split(r'[.!?]+', stripped)[0].strip()
    if len(lead) >= 10:
        return lead

    # Leading segment was empty or too short: use the opening words instead
    return ' '.join(words[:5])

# Run the baseline over every training row
train_df['baseline_pred'] = train_df.apply(lambda row: baseline_predict(row['text'], row['sentiment']), axis=1)

# Score each prediction against its labelled selection
scores = [
    jaccard(actual, predicted)
    for actual, predicted in zip(train_df['selected_text'], train_df['baseline_pred'])
]

train_df['jaccard_score'] = scores
baseline_score = np.mean(scores)

print(f"Baseline Jaccard score: {baseline_score:.4f}")

# Print five randomly chosen rows (drawn without replacement) next to their labels
print("\nSample predictions:")
sample_idx = np.random.choice(len(train_df), 5, replace=False)
for idx in sample_idx:
    row = train_df.iloc[idx]
    print(
        f"Sentiment: {row['sentiment']}",
        f"Text: '{row['text']}'",
        f"Actual: '{row['selected_text']}'",
        f"Predicted: '{row['baseline_pred']}'",
        f"Score: {row['jaccard_score']:.3f}",
        "---",
        sep="\n",
    )

In [None]:
# Make predictions on test set
test_df['selected_text'] = test_df.apply(lambda row: baseline_predict(row['text'], row['sentiment']), axis=1)

# Create submission file
submission = test_df[['textID', 'selected_text']].copy()

# Do NOT wrap predictions in literal '"' characters. to_csv already quotes any
# field that needs it, and it escapes embedded quotes by doubling them, so a
# manually added pair is read back as part of the prediction itself. Those
# stray quotes stick to the first and last words and lower the word-level
# Jaccard score.
submission['selected_text'] = submission['selected_text'].astype(str)

print("Sample submission:")
print(submission.head())

# Save submission
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Shape: {submission.shape}")

In [None]:
# Calculate cross-validation score for more robust evaluation
from sklearn.model_selection import KFold

# Simple CV to get a better estimate
def cv_score(df, n_splits=5):
    """Score the rule-based baseline on shuffled K-fold validation splits.

    Nothing is fitted here, so only the validation indices of each split are
    used; the training indices are discarded.

    Args:
        df: Frame with 'text', 'sentiment' and 'selected_text' columns.
        n_splits: Number of folds passed to KFold.

    Returns:
        Tuple of (mean, standard deviation) of the per-fold mean Jaccard scores.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_means = []

    for _, val_idx in splitter.split(df):
        fold = df.iloc[val_idx]

        # Jaccard between each labelled selection and the baseline's guess
        fold_scores = [
            jaccard(row['selected_text'], baseline_predict(row['text'], row['sentiment']))
            for _, row in fold.iterrows()
        ]
        fold_means.append(np.mean(fold_scores))

    return np.mean(fold_means), np.std(fold_means)

cv_mean, cv_std = cv_score(train_df)
print(f"CV Score: {cv_mean:.4f} ± {cv_std:.4f}")

# Summary banner followed by the headline numbers, one item per line
print(
    f"\n{'='*50}",
    "BASELINE MODEL RESULTS",
    f"{'='*50}",
    f"Training score: {baseline_score:.4f}",
    f"CV score: {cv_mean:.4f} ± {cv_std:.4f}",
    f"Test predictions: {len(test_df)} samples",
    "Submission saved: /home/submission/submission.csv",
    sep="\n",
)