In [1]:
import pandas as pd
from pathlib import Path
from lingua import Language, LanguageDetectorBuilder

In [2]:
# Project layout, resolved relative to the parent of the current working directory.
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR.joinpath("data")                # input CSV files
OUTPUT_DIR = BASE_DIR.joinpath("survey", "output")  # exported samples

In [3]:
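# Disabled settings for the first input file (targeted_tweets_1.csv).
# The active paths are assigned in the following cell, which overrides these if both are run.
# INPUT_FILE = DATA_DIR / "targeted_tweets_1.csv"
# OUTPUT_FILE = OUTPUT_DIR / "non_english_samples_1.csv"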

In [None]:
# Numeric suffix shared by the input and output file names. Changing it here
# switches both paths together, so the output can never be written under a
# suffix that does not match the input that produced it.
DATASET_INDEX = 2
INPUT_FILE = DATA_DIR / f"targeted_tweets_{DATASET_INDEX}.csv"
OUTPUT_FILE = OUTPUT_DIR / f"non_english_samples_{DATASET_INDEX}.csv"

In [5]:
# Candidate languages the detector is allowed to choose from. The trailing
# comment on each entry lists the countries it is associated with; entries
# that are commented out are currently excluded from detection.
languages = [
    Language.ENGLISH,  # USA, Philippines, Canada, Australia, UK, Singapore
    Language.SPANISH,  # Costa Rica, Guatemala, Venezuela, Spain
    # Language.ITALIAN,  # Italy
    # Language.TAGALOG,  # Philippines
    Language.FRENCH,  # Canada
    # Language.PORTUGUESE,  # Brazil
    Language.RUSSIAN,  # Russia
    Language.BENGALI,  # Bangladesh
    Language.CHINESE,  # Singapore
    Language.MALAY,  # Singapore
    Language.TAMIL,  # Singapore
]

# Build the detector from the restricted list above.
# Alternative covering every supported language:
# language_detector = LanguageDetectorBuilder.from_all_languages().build()
language_detector = (
    LanguageDetectorBuilder
    .from_languages(*languages)
    .build()
)

def detect_language(text):
    """Return the lowercase ISO 639-1 code of the language detected in ``text``.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None`` or
    ``NaN``) and blank strings yield ``'unknown'`` without consulting the
    detector. Without this guard a missing tweet would be stringified to
    ``"nan"`` / ``"None"`` and classified as if it were real text.

    Args:
        text: Value to classify. Non-string values are converted with ``str()``.

    Returns:
        str: The lowercased name of the detected language's ISO 639-1 code, or
        ``'unknown'`` when the input is missing or blank, the detector returns
        ``None``, or detection raises an exception.
    """
    # Check scalars only: pd.isna() on a list-like returns an array, which
    # cannot be used directly as a truth value.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 'unknown'

    try:
        candidate = str(text)
        if not candidate.strip():
            # Nothing to classify in an empty / whitespace-only string.
            return 'unknown'

        detected_language = language_detector.detect_language_of(candidate)
        if detected_language is None:
            return 'unknown'
        return detected_language.iso_code_639_1.name.lower()
    except Exception:
        # Best-effort by design: a single problematic row must not abort the
        # element-wise pass over the whole column.
        return 'unknown'

In [6]:
# Read the input CSV and append a 'detected_language' column holding the
# per-tweet result of detect_language().
df = pd.read_csv(INPUT_FILE).assign(
    detected_language=lambda tweets: tweets['Tweet Text'].apply(detect_language)
)

In [7]:
# Keep every row whose detected code is anything other than 'en'
# (rows labelled 'unknown' are therefore kept as well), then preview the result.
df_non_english = df.loc[df['detected_language'].ne('en')]
df_non_english.head()

Unnamed: 0,Tweet ID,Tweet Text,Information Source,Information Type,Informativeness,source_dataset,detected_language
13,334472155234463744,Ni saben pronunciar el nombre de Dzhokhar Tsar...,Not applicable,Other Useful Information,Related - but not informative,2013_Boston_bombings,es
103,325601298768621569,"@BRSurabaya broh,ada menit diam u/ (alm) anne ...",Not applicable,Not applicable,Related - but not informative,2013_Boston_bombings,ms
113,323913577922715648,RT @inecascos: que huevos hacer estoy en una m...,Outsiders,Not applicable,Related - but not informative,2013_Boston_bombings,es
147,323932745900363776,"110 lesionados y dos muertos?y los que faltan,...",Not applicable,Not applicable,Related - but not informative,2013_Boston_bombings,es
238,324058965115822081,Què triste y horrible lo ocurrido en Boston! E...,Outsiders,Not applicable,Related - but not informative,2013_Boston_bombings,es


In [8]:
# Make sure the destination directory exists before writing: to_csv() does not
# create missing parent directories and would fail on a fresh checkout.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame's row index out of the exported file.
df_non_english.to_csv(OUTPUT_FILE, index=False)