In [1]:
!pip install vaderSentiment

import pandas as pd
import numpy as np
import os
import re
import emoji

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import cohen_kappa_score
from sklearn.utils.class_weight import compute_class_weight
from PIL import Image, ImageStat
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from imblearn.over_sampling import SMOTE
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tensorflow.keras.callbacks import EarlyStopping

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
# Locations of the competition CSV files and image folders
print("Loading file paths...")
TRAIN_CSV = '/kaggle/input/deep-learning-for-computer-vision-and-nlp-2024-10/train.csv'
TEST_CSV = '/kaggle/input/deep-learning-for-computer-vision-and-nlp-2024-10/test.csv'
TRAIN_IMAGES_PATH = '/kaggle/input/deep-learning-for-computer-vision-and-nlp-2024-10/images/images/train'
TEST_IMAGES_PATH = '/kaggle/input/deep-learning-for-computer-vision-and-nlp-2024-10/images/images/test'

# Read both CSV files and report their dimensions
print("Loading data...")
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))
print("Training dataset size:", train_data.shape)
print("Test dataset size:", test_data.shape)

# Missing descriptions become empty strings and the column is forced to str,
# so the text helpers below can call string methods on every row
for frame in (train_data, test_data):
    frame['Description'] = frame['Description'].fillna('').astype(str)

Loading file paths...
Loading data...
Training dataset size: (6431, 3)
Test dataset size: (1891, 2)


In [3]:
def calculate_brightness(image_path):
    """Return the mean grayscale intensity of the image at ``image_path``.

    The image is converted to PIL mode ``'L'`` (grayscale) and the mean of
    that single band is returned.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The mean pixel value of the grayscale image, or ``0`` when the file
        cannot be opened or processed (best-effort fallback).
    """
    try:
        # The context manager releases the file handle as soon as we are done;
        # this matters because the function is called once per image file.
        with Image.open(image_path) as img:
            grayscale = img.convert('L')  # Convert to grayscale
            return ImageStat.Stat(grayscale).mean[0]  # Average brightness
    except Exception:
        # Deliberate best-effort: unreadable or corrupt files count as
        # brightness 0. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit stop a long-running extraction.
        return 0

# Function to extract metadata from images
def extract_image_features(pet_id, path):
    """Summarise the image folder ``{path}/{pet_id}`` of a single pet.

    Args:
        pet_id: Name of the pet's sub-folder inside ``path``.
        path: Directory that contains one sub-folder per pet.

    Returns:
        A ``pd.Series`` with three values, in this order:
        ``has_image`` (1 if the folder holds at least one file, else 0),
        ``num_images`` (number of entries in the folder) and
        ``avg_brightness`` (mean of ``calculate_brightness`` over the files).
        All three are 0 when the folder is missing or empty.
    """
    img_folder = f'{path}/{pet_id}'
    # isdir (rather than exists) also guards against a plain file with the
    # pet's name, which would make os.listdir raise.
    if not os.path.isdir(img_folder):
        return pd.Series([0, 0, 0])  # No images

    image_files = os.listdir(img_folder)
    if not image_files:
        # np.mean([]) evaluates to NaN (with a RuntimeWarning); that NaN would
        # flow into the feature matrix, so an empty folder is reported the
        # same way as a missing one.
        return pd.Series([0, 0, 0])

    num_images = len(image_files)
    avg_brightness = np.mean([calculate_brightness(f'{img_folder}/{img_file}') for img_file in image_files])

    return pd.Series([1, num_images, avg_brightness])

In [4]:
# Attach the image metadata columns (has_image, num_images, avg_brightness)
# to the training dataset first and to the test dataset second
image_meta_jobs = (
    ("Processing train images for metadata...", train_data, TRAIN_IMAGES_PATH),
    ("Processing test images for metadata...", test_data, TEST_IMAGES_PATH),
)
for message, frame, images_root in image_meta_jobs:
    print(message)
    # extract_image_features returns one 3-value Series per PetID, so apply()
    # yields a 3-column frame that is assigned to the three new columns
    frame[['has_image', 'num_images', 'avg_brightness']] = frame['PetID'].apply(
        extract_image_features, args=(images_root,))

Processing train images for metadata...
Processing test images for metadata...


In [5]:
# Function to extract age from the description
def extract_age(description):
    """Extract the first age mentioned in ``description`` as a number of months.

    Recognised forms are a one- or two-digit number (optionally with a decimal
    part) followed by a unit, separated by optional whitespace or hyphens:
    ``"2 years"``, ``"3months"``, ``"1.5 years"``, ``"2-year-old"``,
    ``"10 yrs"``, ``"4 mths"``. Matching is case-insensitive.

    Args:
        description: Free-text description (a ``str``).

    Returns:
        The age in months as an ``int`` (years are multiplied by 12 and
        fractional results are rounded half up), or ``np.nan`` when no age is
        found.
    """
    age_search = re.search(
        # (?<!\d\.) stops the match from starting on the fractional digits of
        # a number, so "1.5 years" is read as 1.5 and not as "5 years".
        r'(?<!\d\.)\b(\d{1,2}(?:\.\d+)?)[\s-]*(years?|yrs?|months?|mths?)\b',
        description.lower())
    if age_search is None:
        return np.nan

    amount = float(age_search.group(1))
    unit = age_search.group(2)
    # Units starting with "y" are years; everything else in the pattern is months.
    months_per_unit = 12 if unit.startswith('y') else 1
    return int(amount * months_per_unit + 0.5)

# Function to extract breed from the description
def extract_breed(description):
    """Return the earliest breed/species keyword occurring in ``description``.

    The lower-cased text is searched for one of the whole-word keywords
    listed in the pattern; ``'unknown'`` is returned when none occurs.
    """
    found = re.search(r'\b(mixed breed|poodle|labrador|bulldog|cat|dog)\b', description.lower())
    return found.group(0) if found else 'unknown'

# Function to extract health status from the description
def extract_health_status(description):
    """Return the earliest health keyword occurring in ``description``.

    The lower-cased text is searched for one of the whole-word keywords
    listed in the pattern; ``'unknown'`` is returned when none occurs.
    """
    found = re.search(r'\b(healthy|vaccinated|neutered|sick)\b', description.lower())
    return found.group(0) if found else 'unknown'

# Function to count emojis in the text
def count_emojis(text):
    """Count the characters of ``text`` that are keys of ``emoji.EMOJI_DATA``.

    The check is done one character at a time.
    NOTE(review): emojis made of several code points are presumably not
    counted as a single emoji by this per-character test - confirm if that
    matters for the feature.
    """
    emoji_total = 0
    for symbol in text:
        if symbol in emoji.EMOJI_DATA:
            emoji_total += 1
    return emoji_total

# Sentiment analysis using VADER: a single analyzer instance is created here
# and shared by the sentiment helper functions defined in this notebook
analyzer = SentimentIntensityAnalyzer()

def extract_sentiment(description):
    """Score ``description`` with the shared VADER ``analyzer``.

    Returns:
        A ``pd.Series`` holding the ``'pos'``, ``'neu'``, ``'neg'`` and
        ``'compound'`` entries of ``analyzer.polarity_scores`` in that order.
    """
    scores = analyzer.polarity_scores(description)
    ordered_keys = ('pos', 'neu', 'neg', 'compound')
    return pd.Series([scores[key] for key in ordered_keys])

In [6]:
# Derive the description-based features for the training dataset first and
# for the test dataset second; both get exactly the same columns
for frame in (train_data, test_data):
    descriptions = frame['Description']

    # Keyword / regex based attributes and the emoji count
    frame['Age'] = descriptions.apply(extract_age)
    frame['Breed'] = descriptions.apply(extract_breed)
    frame['Health_Status'] = descriptions.apply(extract_health_status)
    frame['emoji_count'] = descriptions.apply(count_emojis)

    # Sentiment analysis features (positive, neutral, negative, compound)
    frame[['sentiment_pos', 'sentiment_neu', 'sentiment_neg', 'sentiment_compound']] = descriptions.apply(
        extract_sentiment)

In [7]:
# Function to calculate ratios of positive and negative sentiment words
def extract_sentiment_ratios(description):
    """Return the positive and negative share of the VADER scores.

    Each of ``'pos'`` and ``'neg'`` is divided by ``pos + neg + neu``.

    Returns:
        A ``pd.Series`` ``[pos_ratio, neg_ratio]``; both are 0 when the sum
        of the three scores is not positive.
    """
    scores = analyzer.polarity_scores(description)
    denominator = scores['pos'] + scores['neg'] + scores['neu']
    if denominator > 0:
        ratios = [scores['pos'] / denominator, scores['neg'] / denominator]
    else:
        # Nothing to normalise by: report both ratios as zero
        ratios = [0, 0]
    return pd.Series(ratios)

# Function to extract features from an image using ResNet50
def extract_image_features_resnet(pet_id, path, model):
    """Return the ``model`` embedding of the first image of one pet.

    Args:
        pet_id: Name of the pet's sub-folder inside ``path``.
        path: Directory that contains one sub-folder of images per pet.
        model: Object whose ``predict(batch, verbose=0)`` returns the feature
            array for a batch of preprocessed 224x224 images (in this
            notebook: the pooled ResNet50).

    Returns:
        A 1-D array with the flattened prediction for the alphabetically
        first file of the folder, or a zero vector of length 2048 when the
        folder is missing, empty, or the image cannot be processed.
    """
    feature_size = 2048  # ResNet50 output size
    img_folder = f'{path}/{pet_id}'
    if not os.path.isdir(img_folder):
        return np.zeros((feature_size,))

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of the "first" image reproducible between runs and machines.
    image_files = sorted(os.listdir(img_folder))
    if not image_files:
        # An existing but empty folder would otherwise raise IndexError below.
        return np.zeros((feature_size,))

    img_path = f'{img_folder}/{image_files[0]}'  # Use the first image

    try:
        img = image.load_img(img_path, target_size=(224, 224))  # Load and resize
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)  # Add the batch dimension
        img_data = preprocess_input(img_data)  # Prepare for ResNet50
        # verbose=0: this runs once per pet, so a progress bar per call would
        # flood the notebook output.
        features = model.predict(img_data, verbose=0)
        return features.flatten()  # Convert to 1D
    except Exception:
        # Best-effort: an unreadable image yields the same zero vector as a
        # missing one, but KeyboardInterrupt / SystemExit still propagate.
        return np.zeros((feature_size,))

In [8]:
# Add sentiment ratio features to the training dataset
train_data[['pos_ratio', 'neg_ratio']] = train_data['Description'].apply(extract_sentiment_ratios)

# Add sentiment ratio features to the test dataset
test_data[['pos_ratio', 'neg_ratio']] = test_data['Description'].apply(extract_sentiment_ratios)

# Create binary feature for missing age (must be computed before the NaNs are
# replaced below)
train_data['Age_missing'] = train_data['Age'].isna().astype(int)
test_data['Age_missing'] = test_data['Age'].isna().astype(int)

# Replace missing ages with a special value (-1 to indicate unknown)
train_data['Age'] = train_data['Age'].fillna(-1)
test_data['Age'] = test_data['Age'].fillna(-1)

# Encode categorical features (Breed, Health_Status) using OneHotEncoder.
# scikit-learn renamed the `sparse` argument to `sparse_output` and later
# removed the old name; try the new spelling first and fall back to the old
# one so the cell runs on both older and newer installations.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# The encoder is fitted on the training rows only; categories that appear
# only in the test set are ignored (handle_unknown='ignore').
train_breed_health = encoder.fit_transform(train_data[['Breed', 'Health_Status']])
test_breed_health = encoder.transform(test_data[['Breed', 'Health_Status']])

# Process text data using TF-IDF (vocabulary fitted on the training rows only)
vectorizer = TfidfVectorizer(max_features=5000)
train_text_data = vectorizer.fit_transform(train_data['Description'].fillna(''))
test_text_data = vectorizer.transform(test_data['Description'].fillna(''))

# Transfer learning: ResNet50 for image feature extraction
# (no classification head, global average pooling)
resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [9]:
# One ResNet50 feature vector per training PetID, stacked into a 2-D array
print("Extracting ResNet50 features for training images...")
train_vectors = [
    extract_image_features_resnet(pet_id, TRAIN_IMAGES_PATH, resnet)
    for pet_id in train_data['PetID']
]
train_image_features = np.array(train_vectors)

# Same procedure for the test PetIDs
print("Extracting ResNet50 features for test images...")
test_vectors = [
    extract_image_features_resnet(pet_id, TEST_IMAGES_PATH, resnet)
    for pet_id in test_data['PetID']
]
test_image_features = np.array(test_vectors)

Extracting ResNet50 features for training images...
Extracting ResNet50 features for test images...


In [10]:
# Scalar (non-text, non-image) columns that are part of the feature matrix
numeric_columns = ['has_image', 'num_images', 'avg_brightness', 'Age', 'emoji_count', 'Age_missing',
                   'sentiment_pos', 'sentiment_neu', 'sentiment_neg', 'sentiment_compound', 'pos_ratio',
                   'neg_ratio']

# Concatenate all features into a single feature matrix:
# TF-IDF text | scalar columns | one-hot Breed/Health_Status | ResNet50 embedding
train_features = np.hstack([
    train_text_data.toarray(),
    train_data[numeric_columns].values,
    train_breed_health,
    train_image_features
])

test_features_raw = np.hstack([
    test_text_data.toarray(),
    test_data[numeric_columns].values,
    test_breed_health,
    test_image_features
])

# Split training data into train and validation sets.
# The split happens BEFORE scaling so that the scaler statistics come from the
# training part only and the validation score is not influenced by leakage.
X_train, X_val, y_train, y_val = train_test_split(train_features, train_data['AdoptionSpeed'], test_size=0.2,
                                                  random_state=42)

# Standardizing the features.
# Previously the scaled arrays were overwritten by the split of the unscaled
# matrix, so scaling never took effect. Now the scaler is fitted on the
# training split and applied to the validation split and the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
# `test_features` is the matrix used for the final predictions, so it holds
# the scaled version, consistent with what the model is trained on.
test_features = scaler.transform(test_features_raw)

# Handle class imbalance using SMOTE (training split only; the validation
# split keeps its original class distribution)
print("Applying SMOTE to handle class imbalance...")
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

Applying SMOTE to handle class imbalance...


In [11]:
# Class weights, keyed by the actual label values.
# compute_class_weight returns the weights in the order of `classes`; pairing
# them by position with enumerate() only works if the labels happen to be
# 0..k-1, so the labels themselves are used as the dictionary keys.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}
# NOTE(review): the training data passed to fit() below has already been
# rebalanced with SMOTE, so these weights (computed on the unbalanced y_train)
# rebalance a second time - confirm that this is intended.

# Build a fully connected neural network; the single linear output treats the
# adoption speed as a regression target that is rounded to a class afterwards
print("Building the model...")
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Set early stopping criteria to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model.
# The early-stopping callback has to be passed to fit(); it was previously
# created but never used, so training always ran for all epochs.
print("Training the model...")
model.fit(X_train_balanced, y_train_balanced, epochs=50, batch_size=32, validation_data=(X_val, y_val),
          class_weight=class_weight_dict, callbacks=[early_stopping])

# Valid label range, taken from the training labels instead of being hard-coded
label_min, label_max = int(y_train.min()), int(y_train.max())

# Evaluate the model on the validation split
print("Evaluating the model...")
y_val_pred = model.predict(X_val)
y_val_pred_rounded = np.round(y_val_pred).astype(int)
# ravel(): predict() returns shape (n, 1), the metric expects 1-D labels
y_val_pred_rounded = np.clip(y_val_pred_rounded, label_min, label_max).ravel()

# Calculate Cohen's Kappa score (quadratic weights) on the validation split
kappa = cohen_kappa_score(y_val, y_val_pred_rounded, weights='quadratic')
print(f"Cohen's Kappa score: {kappa}")

# Make predictions on the test data
predictions = model.predict(test_features)
predictions_rounded = np.round(predictions).astype(int)
predictions_rounded = np.clip(predictions_rounded, label_min, label_max).ravel()

Building the model...
Training the model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50


I0000 00:00:1729770253.167170      71 service.cc:145] XLA service 0x7fd7b8004500 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1729770253.167222      71 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1729770253.167225      71 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m 63/213[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step - loss: 4.1379 - mae: 1.5225

I0000 00:00:1729770256.785512      71 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - loss: 2.7266 - mae: 1.2315 - val_loss: 1.4560 - val_mae: 1.0132
Epoch 2/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.0847 - mae: 0.8370 - val_loss: 1.3586 - val_mae: 0.9673
Epoch 3/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.9145 - mae: 0.7208 - val_loss: 1.6800 - val_mae: 0.9373
Epoch 4/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.7341 - mae: 0.6344 - val_loss: 1.4180 - val_mae: 0.9721
Epoch 5/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.5249 - mae: 0.5559 - val_loss: 1.4570 - val_mae: 0.9752
Epoch 6/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.7249 - mae: 0.5465 - val_loss: 1.4159 - val_mae: 0.9538
Epoch 7/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4

In [12]:
# Collapse the predictions to a one-dimensional array
predictions_rounded = predictions_rounded.flatten()

# PetIDs of the test rows as a plain Python list
pet_ids = test_data['PetID'].tolist()

# Debugging: report size and container type of both submission columns
n_ids = len(pet_ids)
print("Shape of PetID:", n_ids, type(pet_ids))
print("Shape of predictions_rounded:", predictions_rounded.shape, type(predictions_rounded))

# Both columns must line up row for row before they are combined
if not n_ids == len(predictions_rounded):
    raise ValueError("Length mismatch: PetID and predictions_rounded must be of the same length.")

# Two-column submission table: PetID, AdoptionSpeed
submission = pd.DataFrame({'PetID': pet_ids, 'AdoptionSpeed': predictions_rounded})

# Write the CSV without the index column and without a header row
submission.to_csv('submission.csv', index=False, header=False)
print("Predictions saved successfully.")

Shape of PetID: 1891 <class 'list'>
Shape of predictions_rounded: (1891,) <class 'numpy.ndarray'>
Predictions saved successfully.
