In [None]:
!pip install vaderSentiment

import pandas as pd
import numpy as np
import os
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import cohen_kappa_score
from sklearn.utils.class_weight import compute_class_weight
from PIL import Image, ImageStat
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from imblearn.over_sampling import SMOTE
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import drive
drive.mount('/content/drive')


Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0
Mounted at /content/drive


In [24]:
# File paths
print("Loading file paths...")

project_dir = '/content/drive/My Drive/Neoversity/Colab_Notebooks/Deep_Learning/final_project/'

TRAIN_CSV = project_dir + 'train.csv'
TEST_CSV = project_dir + 'test.csv'
TRAIN_IMAGES_PATH = project_dir+ 'images/train/'
TEST_IMAGES_PATH = project_dir+ 'images/test/'

print("Loading data...")
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)
print("Training dataset size:", train_data.shape)
print("Test dataset size:", test_data.shape)

Loading file paths...
Loading data...
Training dataset size: (6431, 3)
Test dataset size: (1891, 2)


In [25]:
# Filling missing values in the 'Description' column and converting to strings
train_data['Description'] = train_data['Description'].fillna('').astype(str)
test_data['Description'] = test_data['Description'].fillna('').astype(str)

def calculate_brightness(img_path):
    # Placeholder function for brightness calculation
    img = image.load_img(img_path, target_size=(224, 224))  # Example size
    img_data = image.img_to_array(img)
    brightness = np.mean(img_data)  # Simple brightness calculation
    return brightness

def extract_image_features(pet_id, path):
    # Search for images matching the pet_id pattern
    image_files = [f for f in os.listdir(path) if f.startswith(f'{pet_id}-') and f.endswith('.jpg')]

    num_images = len(image_files)

    if num_images == 0:
        return pd.Series([0, 0, 0])  # No images

    avg_brightness = np.mean([calculate_brightness(os.path.join(path, img_file)) for img_file in image_files])

    return pd.Series([1 if num_images > 0 else 0, num_images, avg_brightness])

# Add image metadata to the training dataset
print("Processing train images for metadata...")
train_data[['has_image', 'num_images', 'avg_brightness']] = train_data['PetID'].apply(
    lambda x: extract_image_features(x, TRAIN_IMAGES_PATH))

# Add image metadata to the test dataset
print("Processing test images for metadata...")
test_data[['has_image', 'num_images', 'avg_brightness']] = test_data['PetID'].apply(
    lambda x: extract_image_features(x, TEST_IMAGES_PATH))

Processing train images for metadata...
Processing test images for metadata...


In [26]:
# Functions to extract features from descriptions
def extract_age(description):
    age_search = re.search(r'\b\d{1,2}\s?(months?|years?)\b', description.lower())
    if age_search:
        age = age_search.group(0)
        if 'year' in age:
            return int(re.search(r'\d+', age).group(0)) * 12  # Convert years to months
        elif 'month' in age:
            return int(re.search(r'\d+', age).group(0))  # Keep months
    return np.nan

def extract_breed(description):
    breed_search = re.search(r'\b(Affenpinscher|Afghan Hound|Airedale Terrier|Akbash|Akita|Alaskan Malamute|American Bulldog|American Eskimo Dog|American Hairless Terrier|American Staffordshire Terrier|American Water Spaniel|Anatolian Shepherd|Appenzell Mountain Dog|Australian Cattle Dog/Blue Heeler|Australian Kelpie|Australian Shepherd|Australian Terrier|Basenji|Basset Hound|Beagle|Bearded Collie|Beauceron|Bedlington Terrier|Belgian Shepherd Dog Sheepdog|Belgian Shepherd Laekenois|Belgian Shepherd Malinois|Belgian Shepherd Tervuren|Bernese Mountain|Dog|Bichon Frise|Black and Tan Coonhound|Black Labrador Retriever|Black Mouth Cur|Black Russian Terrier|Bloodhound|Blue Lacy|Bluetick Coonhound|Boerboel|Bolognese|Border Collie|Border Terrier|Borzoi|Boston Terrier|Bouvier des Flanders|Boxer|Boykin Spaniel|Briard|Brittany Spaniel|Brussels Griffon|Bull Terrier|Bullmastiff|Cairn Terrier|Canaan Dog|Cane Corso Mastiff|Carolina Dog|Catahoula Leopard Dog|Cattle Dog|Caucasian Sheepdog |Cavalier King Charles Spaniel|Chesapeake Bay Retriever|Chihuahua|Chinese Crested Dog|Chinese Foo Dog|Chinook|Chocolate Labrador Retriever|Chow Chow|Cirneco dell|Etna|Clumber Spaniel|Cockapoo|Cocker Spaniel|Collie|Coonhound|Corgi|Coton de Tulear|Curly-Coated Retriever|Dachshund|Dalmatian|Dandi Dinmont Terrier|Doberman Pinscher|Dogo Argentino|Dogue de Bordeaux|Dutch Shepherd|English Bulldog|English Cocker Spaniel|English Coonhound|English Pointer|English Setter|English Shepherd|English Springer Spaniel|English Toy Spaniel|Entlebucher|Eskimo Dog|Feist|Field Spaniel|Fila Brasileiro|Finnish Lapphund|Finnish Spitz|Flat-coated Retriever|Fox Terrier|Foxhound|French Bulldog|Galgo Spanish Greyhound|German Pinscher|German Shepherd Dog|German Shorthaired Pointer|German Spitz|German Wirehaired Pointer|Giant Schnauzer|Glen of Imaal|Terrier|Golden Retriever|Gordon Setter|Great Dane|Great Pyrenees|Greater Swiss Mountain Dog|Greyhound|Harrier|Havanese|Hound|Hovawart|Husky|Ibizan Hound|Illyrian Sheepdog|Irish Setter|Irish Terrier|Irish Water Spaniel|Irish Wolfhound|Italian Greyhound|Italian Spinone|Jack Russell Terrier|Jack Russell Terrier (Parson Russell Terrier)|Japanese Chin|Jindo|Kai Dog|Karelian Bear Dog|Keeshond|Kerry Blue Terrier|Kishu|Klee Kai|Komondor|Kuvasz|Kyi Leo|Labrador Retriever|Lakeland Terrier|Lancashire Heele|Leonberger|Lhasa Apso|Lowchen|Maltese|Manchester Terrier|Maremma Sheepdog|Mastiff|McNab|Miniature Pinscher|Mountain Cur|Mountain Dog|Munsterlander|Neapolitan Mastiff|New Guinea Singing Dog|Newfoundland Dog|Norfolk Terrier|Norwegian Buhund|Norwegian Elkhound|Norwegian Lundehund|Norwich Terrier|Nova Scotia Duck-Tolling|Retriever|Old English Sheepdog|Otterhound|Papillon|Patterdale Terrier (Fell Terrier)|Pekingese|Peruvian Inca Orchid|Petit Basset Griffon Vendeen|Pharaoh Hound|Pit Bull Terrier|Plott Hound|Podengo Portugueso|Pointer|Polish Lowland Sheepdog|Pomeranian|Poodle|Portuguese Water Dog|Presa Canario|Pug|Puli|Pumi|Rat Terrier|Redbone Coonhound|Retriever|Rhodesian Ridgeback|Rottweiler|Saint Bernard|Saluki|Samoyed|Sarplaninac|Schipperke|Schnauzer|Scottish Deerhound|Scottish Terrier Scottie|Sealyham Terrier|Setter|Shar Pei|Sheep Dog|Shepherd|Shetland Sheepdog Sheltie|Shiba Inu|Shih Tzu|Siberian Husky|Silky Terrier|Skye Terrier|Sloughi|Smooth Fox Terrier|South Russian Ovtcharka|Spaniel|Spitz|Staffordshire Bull Terrier|Standard Poodle|Sussex Spaniel|Swedish Vallhund|Terrier|Thai Ridgeback|Tibetan Mastiff|Tibetan Spaniel|Tibetan Terrier|Tosa Inu|Toy Fox Terrier|Treeing Walker Coonhound|Vizsla|Weimaraner|Welsh Corgi|Welsh Springer Spaniel|Welsh Terrier|West Highland White Terrier Westie|Wheaten Terrier|Whippet|White German Shepherd|Wire Fox Terrier|Wire-haired Pointing Griffon|Wirehaired Terrier|Xoloitzcuintle/Mexican Hairless|Yellow Labrador Retriever|Yorkshire Terrier Yorkie|Mixed Breed|Abyssinian|American Curl|American Shorthair|American Wirehair|Applehead Siamese|Balinese|Bengal|Birman|Bobtail|Bombay|British Shorthair|Burmese|Burmilla|Calico|Canadian Hairless|Chartreux|Chausie|Chinchilla|Cornish Rex|Cymric|Devon Rex|Dilute Calico|Dilute Tortoiseshell|Domestic Long Hair|Domestic Medium Hair|Domestic Short Hair|Egyptian Mau|Exotic Shorthair|Extra-Toes Cat (Hemingway Polydactyl)|Havana|Himalayan|Japanese Bobtail|Javanese|Korat|LaPerm|Maine Coon|Manx|Munchkin|Nebelung|Norwegian Forest Cat|Ocicat|Oriental Long Hair|Oriental Short Hair|Oriental Tabby|Persian|Pixie-Bob|Ragamuffin|Ragdoll|Russian Blue|Scottish Fold|Selkirk Rex|Siamese|Siberian|Silver|Singapura|Snowshoe|Somali|Sphynx (hairless cat)|Tabby|Tiger|Tonkinese|Torbie|Tortoiseshell|Turkish Angora|Turkish Van|Tuxedo|cat|dog)\b', description.lower())
    if breed_search:
        return breed_search.group(0)
    return 'unknown'

def count_emojis(text):
    return len([char for char in text if char in emoji.EMOJI_DATA])

def extract_gender(description):
    if re.search(r'\bMale\b', description, re.IGNORECASE):
        return 1
    elif re.search(r'\bFemale\b', description, re.IGNORECASE):
        return 2
    else:
        return 0

def extract_vacination(description):
    if re.search(r'\bvaccinated\b', description, re.IGNORECASE):
        return 1
    elif re.search(r'\bnot vaccinated\b', description, re.IGNORECASE):
        return 2
    else:
        return 0

def extract_dewormed_status(description):
    if re.search(r'\bdewormed\b', description, re.IGNORECASE):
        return 1
    elif re.search(r'\bnot dewormed\b', description, re.IGNORECASE):
        return 2
    else:
        return 3

def extract_sterilization_status(description):
    if re.search(r'\bspayed\b|neutered', description, re.IGNORECASE):
        return 1
    elif re.search(r'\bnot spayed\b|\bnot neutered\b', description, re.IGNORECASE):
        return 2
    else:
        return 0

def extract_health_status(description):
    if re.search(r'\bhealthy\b', description, re.IGNORECASE):
        return 1
    elif re.search(r'\bminor injury\b', description, re.IGNORECASE):
        return 2
    elif re.search(r'\bserious injury\b', description, re.IGNORECASE):
        return 3
    else:
        return 4

# Sentiment analysis using VADER
analyzer = SentimentIntensityAnalyzer()

def extract_sentiment(description):
    sentiment = analyzer.polarity_scores(description)
    return pd.Series([sentiment['pos'], sentiment['neu'], sentiment['neg'], sentiment['compound']])

In [27]:
# Add new features to the training dataset
train_data['Age'] = train_data['Description'].apply(extract_age)
train_data['Breed'] = train_data['Description'].apply(extract_breed)
train_data['Gender'] = train_data['Description'].apply(extract_gender)
train_data['Vaccination'] = train_data['Description'].apply(extract_vacination)
train_data['Dewormed'] = train_data['Description'].apply(extract_dewormed_status)
train_data['Health'] = train_data['Description'].apply(extract_health_status)
train_data['Sterilization'] = train_data['Description'].apply(extract_sterilization_status)

# Add new features to the test dataset
test_data['Age'] = test_data['Description'].apply(extract_age)
test_data['Breed'] = test_data['Description'].apply(extract_breed)
test_data['Gender'] = test_data['Description'].apply(extract_gender)
test_data['Vaccination'] = test_data['Description'].apply(extract_vacination)
test_data['Dewormed'] = test_data['Description'].apply(extract_dewormed_status)
test_data['Health'] = test_data['Description'].apply(extract_health_status)
test_data['Sterilization'] = test_data['Description'].apply(extract_sterilization_status)

# Add sentiment analysis features
train_data[['sentiment_pos', 'sentiment_neu', 'sentiment_neg', 'sentiment_compound']] = train_data['Description'].apply(
    extract_sentiment)
test_data[['sentiment_pos', 'sentiment_neu', 'sentiment_neg', 'sentiment_compound']] = test_data['Description'].apply(
    extract_sentiment)

In [28]:
# Function to calculate ratios of positive and negative sentiment words
def extract_sentiment_ratios(description):
    sentiment = analyzer.polarity_scores(description)
    total = sentiment['pos'] + sentiment['neg'] + sentiment['neu']
    pos_ratio = sentiment['pos'] / total if total > 0 else 0
    neg_ratio = sentiment['neg'] / total if total > 0 else 0
    return pd.Series([pos_ratio, neg_ratio])

def extract_sentiment_ratios(description):
    sentiment = analyzer.polarity_scores(description)
    total = sentiment['pos'] + sentiment['neg'] + sentiment['neu']
    pos_ratio = sentiment['pos'] / total if total > 0 else 0
    neg_ratio = sentiment['neg'] / total if total > 0 else 0
    return pd.Series([pos_ratio, neg_ratio])

def extract_image_features_resnet(pet_id, path, model):
    # Search for images matching the pet_id pattern
    image_files = [f for f in os.listdir(path) if f.startswith(f'{pet_id}-') and f.endswith('.jpg')]

    if len(image_files) == 0:
        return np.zeros((2048,))  # ResNet50 output size

    # Use the first image
    img_path = os.path.join(path, image_files[0])

    try:
        img = image.load_img(img_path, target_size=(224, 224))  # Normalize image
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)  # Prepare for ResNet50
        features = model.predict(img_data)
        return features.flatten()  # Convert to 1D
    except:
        return np.zeros((2048,))

In [29]:
# Add sentiment ratio features to the training dataset
train_data[['pos_ratio', 'neg_ratio']] = train_data['Description'].apply(extract_sentiment_ratios)

# Add sentiment ratio features to the test dataset
test_data[['pos_ratio', 'neg_ratio']] = test_data['Description'].apply(extract_sentiment_ratios)

# Replace missing ages with a special value (-1 to indicate unknown)
train_data['Age'] = train_data['Age'].fillna(-1)
test_data['Age'] = test_data['Age'].fillna(-1)

train_data.to_csv(project_dir + 'train_data2.csv', index=False)
# Encode categorical features (Breed, Health_Status)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_breed_health = encoder.fit_transform(train_data[['Breed', 'Gender', 'Vaccination', 'Dewormed', 'Health', 'Sterilization']])
test_breed_health = encoder.transform(test_data[['Breed', 'Gender', 'Vaccination', 'Dewormed', 'Health', 'Sterilization']])

# Process text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
train_text_data = vectorizer.fit_transform(train_data['Description'].fillna(''))
test_text_data = vectorizer.transform(test_data['Description'].fillna(''))

# Transfer learning: ResNet50 for image feature extraction
resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Extract ResNet50 features for training images
print("Extracting ResNet50 features for training images...")
train_image_features = np.array(
    [extract_image_features_resnet(pet_id, TRAIN_IMAGES_PATH, resnet) for pet_id in train_data['PetID']])

# Extract ResNet50 features for test images
print("Extracting ResNet50 features for test images...")
test_image_features = np.array(
    [extract_image_features_resnet(pet_id, TEST_IMAGES_PATH, resnet) for pet_id in test_data['PetID']])

Extracting ResNet50 features for training images...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 580ms/step
[

In [30]:
# Concatenate all features into a single feature matrix
train_features = np.hstack([
    train_text_data.toarray(),
    train_data[['has_image', 'num_images', 'avg_brightness', 'Age',
                'sentiment_pos', 'sentiment_neu', 'sentiment_neg', 'sentiment_compound', 'pos_ratio',
                'neg_ratio']].values,
    train_breed_health,
    train_image_features
])

test_features = np.hstack([
    test_text_data.toarray(),
    test_data[['has_image', 'num_images', 'avg_brightness', 'Age',
               'sentiment_pos', 'sentiment_neu', 'sentiment_neg', 'sentiment_compound', 'pos_ratio',
               'neg_ratio']].values,
    test_breed_health,
    test_image_features
])

In [31]:
# Standardizing the features
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)

X_train = scaler.fit_transform(train_features)
X_val = scaler.transform(test_features)

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_data['AdoptionSpeed'], test_size=0.2,
                                                  random_state=42)

# Handle class imbalance using SMOTE
print("Applying SMOTE to handle class imbalance...")
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

Applying SMOTE to handle class imbalance...


In [32]:
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Build the neural network
print("Building the neural network model...")
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model with early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(X_train_balanced, y_train_balanced, epochs=50, batch_size=32, validation_data=(X_val, y_val),class_weight=class_weight_dict)

# Evaluate the model on validation data
y_val_pred = model.predict(X_val)
y_val_pred_rounded = np.round(y_val_pred).astype(int)
y_val_pred_rounded = np.clip(y_val_pred_rounded, 1, 4)

# Calculate Cohen's Kappa score
kappa = cohen_kappa_score(y_val, y_val_pred_rounded, weights='quadratic')
print(f"Cohen's Kappa score: {kappa}")

Building the neural network model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 60ms/step - loss: 4.5935 - mae: 1.4592 - val_loss: 1.9521 - val_mae: 1.1736
Epoch 2/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 75ms/step - loss: 1.4097 - mae: 0.9447 - val_loss: 1.8111 - val_mae: 1.1202
Epoch 3/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 59ms/step - loss: 1.4208 - mae: 0.8985 - val_loss: 2.1723 - val_mae: 1.1930
Epoch 4/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 67ms/step - loss: 1.0067 - mae: 0.8139 - val_loss: 1.9717 - val_mae: 1.1374
Epoch 5/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 67ms/step - loss: 0.9308 - mae: 0.7694 - val_loss: 2.0754 - val_mae: 1.1508
Epoch 6/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 64ms/step - loss: 0.8057 - mae: 0.7167 - val_loss: 2.0109 - val_mae: 1.1824
Epoch 7/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0

In [33]:
# Shape checks (for debugging)
print(f"Shape of test_features: {test_features.shape}")
print(f"Shape of test_data['PetID']: {test_data['PetID'].shape}")

# Make predictions on test set
predictions = model.predict(test_features)
predictions_rounded = np.round(predictions).astype(int)
predictions_rounded = np.clip(predictions_rounded, 1, 4)
print(f"Shape of predictions: {predictions.shape}")


predictions_rounded = predictions_rounded.flatten()
pet_ids = test_data['PetID'].tolist()

# Ensure the length matches the test data
if len(pet_ids) != len(predictions_rounded):
    raise ValueError("Length mismatch: PetID and predictions_rounded must be of the same length.")


# Create the submission DataFrame with PetID and the predicted AdoptionSpeed
submission = pd.DataFrame({
    'PetID': test_data['PetID'].astype(str),
    'AdoptionSpeed': predictions_rounded
})

Shape of test_features: (1891, 7074)
Shape of test_data['PetID']: (1891,)
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
Shape of predictions: (1891, 1)


In [34]:
# Save the submission to a CSV file
submission.to_csv(project_dir + 'submission.csv', index=False)
print("Predictions saved successfully to 'submission.csv'.")

Predictions saved successfully to 'submission.csv'.
