In [None]:
import nbformat

nb = nbformat.read("main.ipynb", as_version=4)

# Remove widgets metadata
nb.metadata.pop("widgets", None)

nbformat.write(nb, "main.ipynb")


# EVA AI Hackathon — Submission Notebook Template

**Challenge:** `CH-05 — < AI Social Listening (Off-page Focus)>`  
**Participant / Team:** `<Omar Hamdy>`  
**Date:** `2026-02-06`  
**Runtime:** CPU / GPU (choose as needed)  
**Offline compliance:** No external APIs (unless explicitly allowed by organizers)

---

## Submission Package (Required)
This notebook is part of your final submission ZIP:

```
YourName_ProjectName.zip
  main.ipynb
  README.txt
  technical_report.pdf
  /data
  /output
```

- `main.ipynb`: this notebook (end-to-end pipeline)
- `README.txt` (max 1 page): run steps + expected runtime + hardware assumptions
- `technical_report.pdf` (max 2–3 pages): approach + design choices + limitations + failure modes
- `/output`: final outputs **with official file names and schemas**


## Quick Start (How to use this notebook)
1) Run cells from top to bottom.  
2) Make sure your inputs are placed under `/data`.  
3) Your code must write final submission files under `/output`.  
4) Do **not** change output file names/columns unless the official schema requires it.

> If the official dataset/schemas are released later, update only the **I/O Contract** section and the **Output Writer** section.


In [36]:
# === 0) Setup: basic imports ===
!pip install transformers torch pandas openpyxl scikit-learn tqdm
import os
import sys
import json
import random
import numpy as np
import pandas as pd
import re
from datetime import datetime
from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

print("Python:", sys.version)
print("Pandas:", pd.__version__)


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Pandas: 2.2.2


In [37]:
# === 0.1) Configuration (EDIT THESE) ===
CHALLENGE_CODE = "CH-05"
CHALLENGE_TITLE = "AI-Social-Listening-Off-Pages-Focus"
TEAM_NAME = "<Omar Hamdy>"

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

os.makedirs("data", exist_ok=True)
os.makedirs("output", exist_ok=True)

DATA_DIR = "/content/data/eva_social_listening_INPUT_FINAL 6(in) 1.xlsx"
OUTPUT_DIR = "/content/output/social_listening_output (1).csv"

print("Challenge:", CHALLENGE_CODE, "-", CHALLENGE_TITLE)
print("Team:", TEAM_NAME)
print("SEED:", SEED)
print("DATA_DIR:", DATA_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)


Challenge: CH-05 - AI-Social-Listening-Off-Pages-Focus
Team: <Omar Hamdy>
SEED: 42
DATA_DIR: /content/data/eva_social_listening_INPUT_FINAL 6(in) 1.xlsx
OUTPUT_DIR: /content/output/social_listening_output (1).csv


## 1) I/O Contract (Update when official schema is released)

### Input location
- `/data/`

### Output location
- `/output/`

### Expected input files (examples — replace with official list)
- `<file_1>.csv` — short description
- `<file_2>.csv` — short description

### Required output files (examples — replace with official list)
- `<output_file>.csv` — short description

### Schema Reference
- Official schema source: **Data & Schema Addendum**
- This notebook must follow the official schema strictly (file names, columns, types, keys, constraints).


In [38]:
# === 2) Inspect input folder ===
if not os.path.exists(DATA_DIR):
    raise FileNotFoundError(f"DATA_DIR not found: {DATA_DIR}. Please create it and upload files under /data.")

print("Files in /data:")
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        rel = os.path.relpath(os.path.join(root, f), DATA_DIR)
        print(" -", rel)


Files in /data:


## 3) Data Loading (Replace with your challenge-specific loading)

**Goal:** Load required input files from `/data` and create clean DataFrames/objects for downstream steps.

✅ Checklist
- All required input files exist  
- Correct encoding / delimiter  
- Dates parsed correctly (if applicable)  
- Keys are unique where required  


In [39]:
# === 3) Data Loading (TEMPLATE) ===
# TODO: Replace with official file names and loading logic.
# Example:
# sales = pd.read_csv(os.path.join(DATA_DIR, "sales.csv"))

data_objects = {}
print("TODO: load your input files into data_objects / DataFrames.")

input_file = os.path.join(DATA_DIR, 'social_listening_structure__1_.xlsx')
df = pd.read_excel(DATA_DIR)
print(df.head(3))


TODO: load your input files into data_objects / DataFrames.
      brand_name                            Brand social media link  \
0  Eva Cosmetics  https://noon.com/product/EVA-0031/review/menti...   
1  Eva Cosmetics  https://jumia.com/product/EVA-0047/review/ment...   
2  Eva Cosmetics                  https://facebook.com/posts/716113   

           product_mentioned Duration From Duration To Market  platform  \
0  Eva Strengthening Shampoo    2025-08-08  2026-02-03  Egypt      Noon   
1     إيفا Roll-On Whitening    2025-08-08  2026-02-03  Egypt     Jumia   
2          Eva Glycerin Soap    2025-08-08  2026-02-03  Egypt  Facebook   

  Source Type language                                text  \
0    Off-page       en    Average product, nothing special   
1    Off-page       ar             سعره مناسب وجودته عالية   
2    Off-page       en  PSA: Eva is having a sale! 50% off   

                                   engagement product_id     mention_id  \
0                       {'help

## 4) Data Validation & Cleaning (Keep it minimal but strict)

✅ Checklist (adapt to your challenge)
- No unexpected nulls in required fields  
- Handle duplicates / invalid rows  
- Validate ranges and allowed values  
- Document assumptions in README / report  


In [40]:
# === 4) Validation & Cleaning (TEMPLATE) ===
# TODO: Implement essential checks based on official schema.
def require_columns(df: pd.DataFrame, cols):
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

print("TODO: apply validation checks here.")

def clean_text(text):
    # Handle null/empty
    if pd.isna(text) or text == '':
        return "No content"

    # Convert to string
    text = str(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove extra spaces
    text = ' '.join(text.split())

    # Strip
    text = text.strip()

    return text

TODO: apply validation checks here.


## 5) Feature Engineering / Core Logic

- If ML: feature extraction, encoding, and transformations  
- If Optimization: define objective and constraints  
- If NLP/CV: preprocessing + embeddings/feature pipeline  
- Avoid leakage: do not use future information in training/validation  


In [58]:
# === 5) Feature Engineering / Core Logic (TEMPLATE) ===
print("TODO: implement your feature engineering / optimization logic here.")


# English sentiment model
sentiment_en = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
)

# Arabic sentiment model
sentiment_ar = pipeline(
    "sentiment-analysis",
    model="CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"
)



def classify_sentiment(text, language):
    """
    Classify sentiment using language-specific models
    """
    try:
        # Handle empty text
        if not text or len(text.strip()) < 3:
            return 'neutral'

        # Truncate long text (model limit)
        text = text[:512]

        # Use appropriate model based on language
        if language == 'ar':
            result = sentiment_ar(text)[0]
            label = result['label'].lower()

            if 'pos' in label:
                return 'positive'
            elif 'neg' in label:
                return 'negative'
            else:
                return 'neutral'

        else:  # English
            result = sentiment_en(text)[0]
            label = result['label'].lower()

            if 'pos' in label:
                return 'positive'
            elif 'neg' in label:
                return 'negative'
            else:
                return 'neutral'

    except:
        return 'neutral'


TODO: implement your feature engineering / optimization logic here.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.pooler.dense.bias       | UNEXPECTED |  | 
roberta.pooler.dense.weight     | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


## 6) Training / Inference

- Baseline first, then improved approach  
- Keep runtime reasonable  
- Log key artifacts (scores, parameters, counts)  


In [57]:
# === 6) Training / Inference (TEMPLATE) ===

("TODO: train model or run main inference pipeline here.")
from tqdm import tqdm
tqdm.pandas()

df['cleaned_text'] = df['text'].progress_apply(clean_text)
df['cleaned_product_mentioned'] = df['product_mentioned'].progress_apply(clean_text)


# Apply sentiment classification
df['sentiment'] = df.progress_apply(
    lambda row: classify_sentiment(row['cleaned_text'], row['language']),
    axis=1
)


print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

100%|██████████| 1400/1400 [00:00<00:00, 200725.56it/s]
100%|██████████| 1400/1400 [00:00<00:00, 282417.55it/s]
100%|██████████| 1400/1400 [02:11<00:00, 10.68it/s]


Sentiment Distribution:
sentiment
positive    559
neutral     425
negative    416
Name: count, dtype: int64





In [56]:
# === 6) Training / Inference (TEMPLATE) ===

# Define topic keywords
topic_keywords = {
    'product_quality': ['quality', 'effective', 'works', 'result', 'جودة', 'فعال', 'نتيجة', 'تأثير', 'مفيد'],
    'smell_fragrance': ['smell', 'scent', 'fragrance', 'perfume', 'ريحة', 'رائحة', 'عطر'],
    'price_value': ['price', 'expensive', 'cheap', 'cost', 'value', 'سعر', 'غالي', 'رخيص', 'تمن', 'فلوس'],
    'moisturizing': ['moisturize', 'dry', 'hydrat', 'soft', 'smooth', 'ترطيب', 'نشف', 'ناعم', 'رطوبة'],
    'longevity': ['last', 'lasting', 'stay', 'fade', 'ثبات', 'يدوم', 'يفضل', 'يروح'],
    'skin_reaction': ['irritat', 'allerg', 'rash', 'sensitive', 'burn', 'حساسية', 'تهيج', 'احمرار', 'حرق'],
    'hair_care': ['hair', 'shampoo', 'conditioner', 'شعر', 'شامبو', 'بلسم'],
    'customer_service': ['service', 'deliver', 'shipping', 'support', 'توصيل', 'خدمة', 'شحن'],
    'packaging': ['packag', 'bottle', 'container', 'box', 'عبوة', 'زجاجة', 'علبة']
}

def classify_topic(text):
    """Simple keyword-based topic classification"""
    text_lower = text.lower()

    # Count keyword matches for each topic
    topic_scores = {}
    for topic, keywords in topic_keywords.items():
        score = sum(1 for keyword in keywords if keyword in text_lower)
        topic_scores[topic] = score

    # Get topic with highest score
    if max(topic_scores.values()) > 0:
        return max(topic_scores, key=topic_scores.get)
    else:
        return 'general_mention'

# Apply topic classification
df['topic'] = df['cleaned_text'].progress_apply(classify_topic)

print("\n\nTopic Distribution:")
print(df['topic'].value_counts())


# Brand keywords
brand_keywords = ['eva', 'ايفا', 'إيفا', 'evacosmetics', 'eva cosmetics']

def calculate_relevance(text):
    """Simple relevance score based on brand mentions"""
    text_lower = text.lower()

    # Check for brand keywords
    for keyword in brand_keywords:
        if keyword in text_lower:
            return 1.0

    return 0.7

# Apply relevance scoring
df['relevance_score'] = df['cleaned_text'].apply(calculate_relevance)

print(f"\nAverage relevance: {df['relevance_score'].mean():.2f}")

100%|██████████| 1400/1400 [00:00<00:00, 68518.39it/s]



Topic Distribution:
topic
general_mention     675
price_value         174
smell_fragrance     118
moisturizing         88
skin_reaction        80
hair_care            72
customer_service     70
product_quality      70
longevity            53
Name: count, dtype: int64

Average relevance: 0.85





## 7) Generate Final Outputs (IMPORTANT)

Write final outputs to `/output` with:
- exact file names
- exact columns and types
- exact row-level requirements (unique keys, no duplicates if disallowed)
- schema compliance gate (hard-fail if incorrect)

✅ Schema Gate Checklist
- Output columns match schema exactly  
- Types match schema (int/float/string/date)  
- Required fields not missing  
- File name and folder correct  


In [59]:
# === 7) Create /output and write files (TEMPLATE) ===
#os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Output folder ready:", OUTPUT_DIR)

# TODO: Replace with official output writer logic.
# Example:
# output_df.to_csv(os.path.join(OUTPUT_DIR, "predictions.csv"), index=False)

print("TODO: write final output file(s) to /output.")

# Prepare output DataFrame
output_df = df.copy()

# Select required columns
output_columns = [
    'mention_id',
    'platform',
    'source_name',
    'Brand social media link',
    'text',
    'brand_name',
    'language',
    'sentiment',
    'topic',
    'relevance_score',
    'engagement',
    'product_mentioned',
    'product_id'
]

# Create final output
final_output = output_df[output_columns].copy()


import json


# Top Topics
top_topics = final_output['topic'].value_counts().head(10)
topics_list = [
    {
        'topic': topic,
        'mention_count': int(count),
        'percentage': round(count / len(final_output) * 100, 2)
    }
    for topic, count in top_topics.items()
]

# Sentiment Distribution
sentiment_dist = final_output['sentiment'].value_counts()
sentiment_breakdown = {
    'positive': int(sentiment_dist.get('positive', 0)),
    'negative': int(sentiment_dist.get('negative', 0)),
    'neutral': int(sentiment_dist.get('neutral', 0))
}

# Alerts
negative_ratio = sentiment_breakdown['negative'] / len(final_output)
alerts = []

if negative_ratio > 0.30:
    alerts.append({
        'type': 'high_negative_sentiment',
        'severity': 'high',
        'message': f'High negative sentiment: {negative_ratio:.1%}',
        'timestamp': datetime.now().isoformat()
    })

# Compile insights
insights = {
    'generation_timestamp': datetime.now().isoformat(),
    'summary': {
        'total_mentions': len(final_output),
    },
    'top_topics': topics_list,
    'sentiment_distribution': sentiment_breakdown,
    'platform_distribution': final_output['platform'].value_counts().to_dict(),
    'language_distribution': final_output['language'].value_counts().to_dict(),
    'alerts': alerts
}

# Save insights
with open('insights.json', 'w', encoding='utf-8') as f:
    json.dump(insights, f, indent=2, ensure_ascii=False)

# CSV
final_output.to_csv('social_listening_output.csv', index=False, encoding='utf-8-sig')

# Download
from google.colab import files
files.download('social_listening_output.csv')
files.download('insights.json')

Output folder ready: /content/output/social_listening_output (1).csv
TODO: write final output file(s) to /output.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [47]:
# === 7.1) Minimal schema checks (TEMPLATE) ===
# TODO: Replace with official schema checks.
def assert_no_duplicates(df: pd.DataFrame, key_cols):
    if df.duplicated(subset=key_cols).any():
        raise ValueError(f"Duplicates detected for key columns: {key_cols}")
assert_no_duplicates
print("TODO: validate output schema correctness here before submission.")


TODO: validate output schema correctness here before submission.


## 8) Final Notes & Submission Checklist

**Before you submit:**
- `/output` contains the required file(s) with official schema  
- Notebook runs end-to-end without manual edits  
- `README.txt` includes run steps + runtime + hardware assumptions  
- `technical_report.pdf` includes approach + design choices + limitations + failure modes  
- Colab link is shared as: **Anyone with the link can view**  


# **Accuracy Check**

In [50]:
# Load ground truth
ground_truth = pd.read_csv('/content/eva_ground_truth_FULL.csv')

# Merge and compare
comparison = final_output[['mention_id', 'sentiment']].merge(
    ground_truth[['mention_id', 'sentiment']],
    on='mention_id',
    suffixes=('_predicted', '_actual')
)

# Calculate accuracy
correct = (comparison['sentiment_predicted'] == comparison['sentiment_actual']).sum()
accuracy = correct / len(comparison) * 100

print(f"Sentiment Accuracy: {accuracy:.2f}%")
print(f"Correct: {correct}/{len(comparison)}")

Sentiment Accuracy: 58.86%
Correct: 824/1400


In [None]:
import nbformat

nb = nbformat.read("main.ipynb", as_version=4)

# Remove widgets metadata
nb.metadata.pop("widgets", None)

nbformat.write(nb, "main.ipynb")
