## 1. Setup and Import Libraries

In [None]:

import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Configuration
# A blanket 'ignore' filter: no warnings are shown for the rest of the session.
warnings.filterwarnings(action='ignore')

# pandas display options: no column limit, up to 100 rows, 1000-character width
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Set style for better visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print('‚úì Libraries imported successfully')

‚úì Libraries imported successfully


## 2. Load the Data
- The data is kept as an Excel sheet because the file is small; for larger files, CSV would be faster to load.

In [9]:
# Load the Excel file
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact location exists. Consider a path relative to the project.
file_path = r"C:\Users\sabah\OneDrive\Desktop\trendyol_case\data\Interview Case - Language Data Analyst - For candidates.xlsx"

# Open the workbook once: list its sheets and parse the Data sheet from the
# same handle. The context manager closes the file when the block exits, so
# `excel_file` is a closed handle afterwards.
with pd.ExcelFile(file_path) as excel_file:
    # Check all sheets
    print(f"Sheets available: {excel_file.sheet_names}")

    # Load the Data sheet
    df = pd.read_excel(excel_file, sheet_name='Data')

print(f"\n‚úì Data loaded: {df.shape[0]:,} rows √ó {df.shape[1]} columns")

Sheets available: ['Intro', 'Case', 'Data']

‚úì Data loaded: 17,898 rows √ó 18 columns


## 3. Initial Data Exploration

In [10]:
# Preview the top of the frame; the bare last expression is rendered as a table
print("First 3 rows of data:")
df.iloc[:3]

First 3 rows of data:


Unnamed: 0,ctmsId,externalId,namespace,contentType,createdAt,sourceLanguage,sourceText,targetLanguage,enReferenceTranslation,targetText,contentId,translationProvider,productViewCount,productRevenue,productURL,Evaluation,Root Cause,Comment
0,prod-qna_prod-qna_382773544_264427545_a,382773544_264427545_a,prod-qna,prod-qna,2024-11-10 18:21:16.242393 UTC,tr-tr,"Merhaba, √ºr√ºn√ºn b√ºt√ºn bedenler i√ßin i√ß boy √∂l√ß...",hu-hu,"Hello, the inner length of the product for all...","Sziasztok, a term√©k bels≈ë hossza minden m√©retn...",382773544,GoogleTranslate,314560.0,83568,https://www.trendyol.com/hu/pname-p-382773544,OK,,
1,prod-qna_prod-qna_729116297_287101008_a,729116297_287101008_a,prod-qna,prod-qna,2025-01-29 06:02:54.223384 UTC,tr-tr,"Merhaba, modelin boyu 173 cm. V√ºcut √∂l√ß√ºleri b...",hu-hu,"Hello, the model's height is 173 cm. Body meas...","Hello, a modell magass√°ga 173 cm. A testm√©rete...",729116297,GoogleTranslate,159493.0,50109,https://www.trendyol.com/hu/pname-p-729116297,OK,,
2,prod-qna_prod-qna_845564735_262708650_a,845564735_262708650_a,prod-qna,prod-qna,2024-11-05 09:24:28.002988 UTC,tr-tr,"Merhaba, ilgili sorularƒ±nƒ±zƒ± Hesabƒ±m-Trendyol ...",hu-hu,"Hello, we kindly ask you to submit your releva...","√údv√∂z√∂lj√ºk! K√©rj√ºk, tegye fel k√©rd√©seit a Fi√≥k...",845564735,GoogleTranslate,112840.0,51696,https://www.trendyol.com/hu/pname-p-845564735,OK,,


In [13]:
# Per-column report: name, dtype, non-null count and percentage of missing values
print("Dataset Columns and Data Types:")
print("=" * 50)
total_rows = len(df)
for column_name, column in df.items():
    present = column.notna().sum()
    missing_share = (column.isna().sum() / total_rows) * 100
    print(f"{column_name:<25} {str(column.dtype):<10} {present:>6}/{total_rows} ({missing_share:>5.1f}% missing)")

Dataset Columns and Data Types:
ctmsId                    object      17798/17898 (  0.6% missing)
externalId                object      17798/17898 (  0.6% missing)
namespace                 object      17898/17898 (  0.0% missing)
contentType               object      17898/17898 (  0.0% missing)
createdAt                 object      17372/17898 (  2.9% missing)
sourceLanguage            object      17898/17898 (  0.0% missing)
sourceText                object      17871/17898 (  0.2% missing)
targetLanguage            object      17898/17898 (  0.0% missing)
enReferenceTranslation    object      13382/17898 ( 25.2% missing)
targetText                object      17898/17898 (  0.0% missing)
contentId                 object      17219/17898 (  3.8% missing)
translationProvider       object      12989/17898 ( 27.4% missing)
productViewCount          float64     10184/17898 ( 43.1% missing)
productRevenue            int64       17898/17898 (  0.0% missing)
productURL                obje

In [14]:
# Basic statistics: total rows, source language(s), and row counts per
# target language and per translation provider.
print("\nüìä DATASET OVERVIEW")
print("="*50)
print(f"Total entries: {len(df):,}")
print(f"\nSource language: {df['sourceLanguage'].unique()}")

# Count every target language in one pass and list them alphabetically.
# value_counts() skips missing values, so a NaN in the column cannot break
# the sort the way sorting a mixed str/float list would.
target_counts = df['targetLanguage'].value_counts().sort_index()
print(f"\nTarget languages ({len(target_counts)}):")
for lang, count in target_counts.items():
    print(f"  ‚Ä¢ {lang}: {count:,} entries")

# Providers are listed from most to least frequent (value_counts() order);
# rows with a missing provider are not counted.
provider_counts = df['translationProvider'].value_counts()
print(f"\nTranslation providers ({len(provider_counts)}):")
for provider, count in provider_counts.items():
    print(f"  ‚Ä¢ {provider}: {count:,} entries")


üìä DATASET OVERVIEW
Total entries: 17,898

Source language: ['tr-tr']

Target languages (11):
  ‚Ä¢ ar-ae: 1,600 entries
  ‚Ä¢ bg-bg: 1,600 entries
  ‚Ä¢ cs-cz: 1,600 entries
  ‚Ä¢ de-de: 1,600 entries
  ‚Ä¢ el-gr: 1,600 entries
  ‚Ä¢ en-us: 2,079 entries
  ‚Ä¢ hu-hu: 1,507 entries
  ‚Ä¢ pl-pl: 1,562 entries
  ‚Ä¢ ro-ro: 1,600 entries
  ‚Ä¢ sk-sk: 1,600 entries
  ‚Ä¢ uk-ua: 1,550 entries

Translation providers (5):
  ‚Ä¢ Alibaba: 6,368 entries
  ‚Ä¢ GoogleTranslate: 3,903 entries
  ‚Ä¢ DeepL: 2,570 entries
  ‚Ä¢ ctms-translation-validation: 139 entries
  ‚Ä¢ GoogleAutoML: 9 entries


## 4. Data Cleaning

In [15]:
# Inspect the Evaluation labels, then merge known misspellings into their
# canonical label.
separator = "=" * 50
print("Data Quality Issues Found:")
print(separator)

# Raw label frequencies, before any correction
print("\n1. Evaluation column values:")
print(df['Evaluation'].value_counts())

# Misspelled label -> canonical label
evaluation_fixes = {
    'Not ok': 'Not OK',
    'Evvaluation Blocked': 'Evaluation Blocked',
}
df['Evaluation'] = df['Evaluation'].replace(evaluation_fixes)

print("\n‚úì Fixed typos in Evaluation column")
print("\nCleaned Evaluation values:")
print(df['Evaluation'].value_counts())

Data Quality Issues Found:

1. Evaluation column values:
Evaluation
OK                     10017
Ideal                   3727
Not OK                  2965
Evaluation Blocked       475
Evvaluation Blocked        1
Not ok                     1
Name: count, dtype: int64

‚úì Fixed typos in Evaluation column

Cleaned Evaluation values:
Evaluation
OK                    10017
Ideal                  3727
Not OK                 2966
Evaluation Blocked      476
Name: count, dtype: int64


## 5. Quality Metrics Calculation

In [16]:
# Distribution of Evaluation labels, followed by the overall error rate
print("üìà QUALITY DISTRIBUTION")
print("=" * 50)

# Rows per label, and each label's share of ALL rows in df (rounded to 0.1%)
quality_dist = df['Evaluation'].value_counts()
quality_pct = quality_dist.div(len(df)).mul(100).round(1)

for (label, n_rows), share in zip(quality_dist.items(), quality_pct):
    blocks = '‚ñà' * int(share / 2)  # one bar segment per 2 percentage points
    print(f"{label:<20} {n_rows:>6} ({share:>5.1f}%) {blocks}")

# Error rate: 'Not OK' rows as a percentage of rows labelled OK, Not OK or Ideal
is_evaluated = df['Evaluation'].isin(['OK', 'Not OK', 'Ideal'])
is_not_ok = df['Evaluation'] == 'Not OK'
evaluated = int(is_evaluated.sum())
not_ok = int(is_not_ok.sum())
error_rate = (not_ok / evaluated) * 100

print(f"\nüéØ KEY METRIC: Overall Error Rate = {error_rate:.1f}%")
print("   (Calculated as Not OK / Total Evaluated)")

üìà QUALITY DISTRIBUTION
OK                    10017 ( 56.0%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Ideal                  3727 ( 20.8%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Not OK                 2966 ( 16.6%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Evaluation Blocked      476 (  2.7%) ‚ñà

üéØ KEY METRIC: Overall Error Rate = 17.7%
   (Calculated as Not OK / Total Evaluated)


In [17]:
# Quality Score Calculation
# Weighted score: Ideal = 1.5, OK = 1.0, Not OK = 0
# NOTE: because Ideal carries a weight of 1.5, the score printed as a
# percentage can range from 0% up to 150%, not 0-100%.

evaluation = df['Evaluation']
ideal_count = (evaluation == 'Ideal').sum()
ok_count = (evaluation == 'OK').sum()
not_ok_count = (evaluation == 'Not OK').sum()

# Derive the denominator here rather than relying on a variable left behind
# by an earlier cell, so this cell gives the same answer whenever it is run.
evaluated_count = ideal_count + ok_count + not_ok_count

quality_score = ((ideal_count * 1.5) + (ok_count * 1.0)) / evaluated_count

print(f"\nüìä QUALITY SCORE: {quality_score:.1%}")
print("   Calculation: (Ideal√ó1.5 + OK√ó1.0) / Total Evaluated")
print(f"   ‚Ä¢ Ideal: {ideal_count:,} √ó 1.5")
print(f"   ‚Ä¢ OK: {ok_count:,} √ó 1.0")
print(f"   ‚Ä¢ Not OK: {not_ok_count:,} √ó 0")


üìä QUALITY SCORE: 93.4%
   Calculation: (Ideal√ó1.5 + OK√ó1.0) / Total Evaluated
   ‚Ä¢ Ideal: 3,727 √ó 1.5
   ‚Ä¢ OK: 10,017 √ó 1.0
   ‚Ä¢ Not OK: 2,966 √ó 0


## 6. Save Cleaned Data

In [18]:
# Save cleaned data for next steps
output_path = '../outputs/cleaned_data.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print(f"‚úì Cleaned data saved to: {output_path}")
print(f"  Shape: {df.shape}")
print("  File ready for Part 2 analysis")

‚úì Cleaned data saved to: ../outputs/cleaned_data.csv
  Shape: (17898, 18)
  File ready for Part 2 analysis


## Summary

### ✅ Part 1 Completed!

**Key Findings:**
- Total entries: 17,898
- Overall error rate: 17.7%
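- Quality score: 93.4% (weighted: Ideal × 1.5, OK × 1.0, Not OK × 0, divided by total evaluated; the maximum possible value is 150%)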
- 11 target languages
- 5 translation providers

**Next Steps:**
- Part 2: Pattern Analysis by Language/Provider
- Part 3: Visualization Dashboard
- Part 4: Root Cause Analysis
- Part 5: Technical Solution Design