<a href="https://colab.research.google.com/github/SofiaLebronPR/Linguistic_Mini_Project/blob/main/Mini_Linguistic_Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Workshop: Mini Linguistics Research — Live Survey Analysis**

This notebook analyzes student survey responses collected during the presentation.
Steps you'll see: load CSV → clean text → quick counts (deixis, emojis) → tiny sentiment → simple visuals.

# **1. Import Libraries**


In [None]:
import pandas as pd
import re, string
from collections import Counter
import matplotlib.pyplot as plt

# **2. Load CSV file**

In [None]:
# Path to your exported CSV (Google Forms).
# The next cell passes this value straight to pd.read_csv().
# NOTE(review): "C:.csv" looks like a placeholder/redacted path — point it at
# the real export file before running the notebook.
CSV_PATH = r"C:.csv"

# **3. Read the file**

In [None]:
# Read the survey export into a DataFrame, report its size, and preview it.
df = pd.read_csv(CSV_PATH)
row_total = df.shape[0]
print("Loaded rows:", row_total)
df.head()

# **4. Filter Consent**

In [None]:
# Keep only students who consented (Yes).
# Answers are compared after trimming whitespace and lowercasing, so
# " Yes ", "SI" and "sí" are all accepted.
CONSENT_YES = {"yes", "y", "sí", "si"}

if "consent" in df.columns:
    consent_clean = df["consent"].astype(str).str.strip().str.lower()
    # .copy() gives the filtered rows their own frame. Later cells assign new
    # columns to `df`; without the copy pandas may treat `df` as a slice of the
    # unfiltered frame and emit SettingWithCopyWarning.
    df = df[consent_clean.isin(CONSENT_YES)].copy()
else:
    print("Column 'consent' not found — skipping filter.")

# **5. Select Comment Column**

In [None]:
# Name of the column holding the student comments; stop early if it is absent.
TEXT_COL = "comment_text"
has_text_col = TEXT_COL in df.columns
if not has_text_col:
    raise ValueError(f"Expected a '{TEXT_COL}' column with student comments.")


# **6. Normalize text**

In [None]:
# Replace missing comments with empty strings and make every entry a str,
# so the regex helpers below never receive NaN.
comments_filled = df[TEXT_COL].fillna("")
df[TEXT_COL] = comments_filled.astype(str)
print("Rows after consent filter:", len(df))
df[[TEXT_COL]].head()

# **7. Tokenization & Emoji Extraction**

In [None]:
# Simple tokenization (regex based, emojis handled separately from words).
#
# EMOJI_RE matches ONE emoji per hit. A run such as three laughing faces in a
# row therefore yields three identical tokens (and adds 3 to that emoji's
# count) instead of one fused token that would be counted as a different emoji.
EMOJI_RE = re.compile(
    r"[\U0001F1E6-\U0001F1FF]{2}"   # a flag is a PAIR of regional indicators
    r"|["
    r"\U0001F600-\U0001F64F"        # emoticons
    r"\U0001F300-\U0001F5FF"        # symbols & pictographs
    r"\U0001F680-\U0001F6FF"        # transport & map symbols
    r"\U0001F1E0-\U0001F1FF"        # lone regional indicators (unpaired)
    r"\U0001F900-\U0001F9FF"        # supplemental symbols & pictographs
    r"\U0001FA70-\U0001FAFF"        # symbols & pictographs extended-A
    r"\u2600-\u27BF"                # miscellaneous symbols & dingbats (hearts, sparkles)
    r"]"
    r"[\U0001F3FB-\U0001F3FF]?"     # keep an optional skin-tone modifier attached
)
# Words are runs of ASCII letters, Spanish accented letters, and apostrophes.
WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ']+")


def extract_emojis(text):
    """Return every emoji found in `text`, one list item per emoji, in order."""
    return EMOJI_RE.findall(text)


def tokenize_words(text):
    """Return the word tokens of `text` (case preserved; digits and punctuation dropped)."""
    return WORD_RE.findall(text)


def normalize(text):
    """Lowercase `text` and trim surrounding whitespace so comments compare consistently."""
    return text.lower().strip()

# **8. Building the Token Lists**

In [None]:
# Normalize each comment once, then flatten emojis and lowercase words
# from all comments into two corpus-wide lists.
normalized_comments = [normalize(comment) for comment in df[TEXT_COL]]

all_emojis = [
    emoji
    for comment in normalized_comments
    for emoji in extract_emojis(comment)
]
all_words = [
    token.lower()
    for comment in normalized_comments
    for token in tokenize_words(comment)
]

# **9. Counting Words, Emojis, and Bigrams**

In [None]:
# Frequency tables over the whole corpus.
word_counts = Counter(all_words)
emoji_counts = Counter(all_emojis)

# Print the Top 10 words so we can see them
print("Top 10 words:", word_counts.most_common(10))
print("Top emojis:", emoji_counts.most_common(10))

# Simple bigrams, built one comment at a time.
# Zipping the flat `all_words` list would also pair the LAST word of one
# comment with the FIRST word of the next, producing word pairs that no
# student actually wrote.
bigrams = Counter()
for comment in df[TEXT_COL]:
    comment_tokens = [w.lower() for w in tokenize_words(normalize(comment))]
    bigrams.update(zip(comment_tokens, comment_tokens[1:]))
print("Top 10 bigrams:", bigrams.most_common(10))

# **10. Tiny Sentiment Analysis (mini lexicon-based, EN + ES — toy model for class)**

In [None]:
# Minimal bilingual sentiment wordlists (we can add more in class!)
# Each lexicon is the union of an English part and a Spanish part.
pos_words = {
    # English
    "good", "great", "funny", "hopeful", "uplifting",
    "love", "cool", "nice", "wow",
} | {
    # Spanish
    "bueno", "buenisimo", "buenísimo", "gracioso", "divertido", "chévere",
    "nítido", "mejor", "feliz", "alegre", "esperanza",
}

neg_words = {
    # English
    "bad", "sad", "angry", "harsh", "fake",
    "worse", "hate", "mad", "annoying", "toxic",
} | {
    # Spanish
    "malo", "triste", "enojado", "falso", "peor", "odio",
    "molesto", "tóxico", "preocupación", "preocupado", "miedo",
}

# A function that computes the sentiment score of one comment
def sentiment_score(text):
    """Return (# positive-lexicon tokens) minus (# negative-lexicon tokens) in `text`.

    Tokens are lowercased before lookup in `pos_words` / `neg_words`;
    words in neither list contribute nothing.
    """
    total = 0
    for token in tokenize_words(text):
        word = token.lower()
        # bool arithmetic: +1 for a positive hit, -1 for a negative hit
        total += (word in pos_words) - (word in neg_words)
    return total

# Score every comment, store the result as a new column, and preview it.
comment_scores = df[TEXT_COL].apply(sentiment_score)
df["sentiment_score"] = comment_scores
preview_cols = ["sentiment_score", TEXT_COL]
print(df[preview_cols].head())

# **11. Mini Deixis / Indexicals Analysis (EN + ES pronouns/determiners)**

In [None]:
# Deictic vocabulary per category: English forms first, then Spanish forms.
deixis_terms = {
    "self": ["i", "me", "my"] + ["yo", "mí", "mi", "mío", "mía"],
    "group_in": ["we", "us", "our"] + ["nosotros", "nosotras", "nuestro", "nuestra"],
    "group_out": ["they", "them", "their"] + ["ellos", "ellas", "su", "sus"],
    "proximal": ["this", "here"] + ["aquí", "este", "esta", "esto"],
    "distal": ["that", "there"] + ["allí", "ese", "esa", "eso", "allá"],
}
# Function to identify and count deictic/indexical terms
def count_deixis(text):
    """Return a dict mapping each `deixis_terms` category to its number of hits in `text`.

    Tokens are lowercased before lookup; a category with no hits maps to 0.
    """
    lowered = [tok.lower() for tok in tokenize_words(text)]
    return {
        category: len([tok for tok in lowered if tok in vocab])
        for category, vocab in deixis_terms.items()
    }
# Present the results: per-comment dicts -> one row per comment -> column
# totals, sorted with the most frequent category first.
deixis_results = df[TEXT_COL].apply(count_deixis)
deixis_df = pd.DataFrame(deixis_results.tolist())
category_totals = deixis_df.sum()
summary_deixis = category_totals.sort_values(ascending=False)
summary_deixis

# **12. Language Mixing (Code-Switching Ratio, very rough): ES vs EN stopword hits**

In [None]:
# Tiny stopword lists used as language cues (Spanish / English).
es_sw = {
    "el", "la", "los", "las", "un", "una",
    "de", "y", "que", "en", "a", "para", "por",
    "yo", "nosotros", "aquí", "este", "esa", "eso",
}
en_sw = {
    "the", "a", "and", "that", "this",
    "in", "to", "for", "of",
    "we", "i", "here",
}
# Function to identify and count code-switching cues
def lang_mix(text):
    """Return stopword-based language cues for one comment.

    Returns a pd.Series with:
      es_hits   – tokens found in the Spanish stopword list `es_sw`
      en_hits   – tokens found in the English stopword list `en_sw`
      mix_ratio – min(es_hits, en_hits) / max(es_hits, en_hits), or 0.0 when
                  there are no hits; 0 means one language only, 1 means balanced.

    Tokens listed in BOTH stopword sets (e.g. "a") say nothing about the
    language, so they are ignored. Counting them on both sides would make a
    purely English sentence such as "I have a dog" look partly Spanish.
    """
    toks = [w.lower() for w in tokenize_words(text)]
    ambiguous = es_sw & en_sw
    es = sum(1 for w in toks if w in es_sw and w not in ambiguous)
    en = sum(1 for w in toks if w in en_sw and w not in ambiguous)
    dominant = max(es, en)
    ratio = min(es, en) / dominant if dominant else 0.0
    return pd.Series({"es_hits": es, "en_hits": en, "mix_ratio": ratio})
# Present the results: compute the language cues per comment and attach them.
LANG_COLS = ["es_hits", "en_hits", "mix_ratio"]
df_lang = df[TEXT_COL].apply(lang_mix)
# Drop any columns left over from a previous run of this cell first, so
# re-running it replaces them instead of appending duplicate columns.
df = pd.concat([df.drop(columns=LANG_COLS, errors="ignore"), df_lang], axis=1)
df[["es_hits","en_hits","mix_ratio","sentiment_score"]].head()

# **13. Visuals**

## **a) Word Frequency**

In [None]:
# Top 10 words (3+ letters).
# Tokens are lowercased so "The" and "the" share one bar, matching how the
# word counts are built in section 9.
joined_comments = " ".join(df[TEXT_COL].astype(str))
long_tokens = [w.lower() for w in tokenize_words(joined_comments) if len(w) > 2]
wc10 = Counter(long_tokens).most_common(10)
# zip(*pairs) splits [(word, count), ...] into two tuples; fall back to empty
# lists when there are no tokens at all.
words, counts = zip(*wc10) if wc10 else ([], [])
fig, ax = plt.subplots()
ax.bar(words, counts)
ax.set_title("Top 10 Words")
ax.set_xlabel("word")
ax.set_ylabel("count")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.show()

## **b) Sentiment Distribution**

In [None]:
# Sentiment distribution: how many comments received each score,
# ordered from the most negative to the most positive score.
score_counts = df["sentiment_score"].value_counts().sort_index()
fig, ax = plt.subplots()
score_counts.plot(kind="bar", ax=ax)
ax.set_title("Sentiment Score Distribution")
ax.set_xlabel("score")
ax.set_ylabel("count")
fig.tight_layout()
plt.show()

## **c) Deixis/Indexicals**

In [None]:
# Deixis summary (bar)
# Start from zero for every category so each one gets a bar even with no hits.
summary = pd.Series({"self":0,"group_in":0,"group_out":0,"proximal":0,"distal":0})
try:
    # 'summary_deixis' is defined in section 11
    summary = summary.add(summary_deixis, fill_value=0)
except NameError:
    # Still draw the (all-zero) chart, but say why it is empty instead of
    # failing silently.
    print("'summary_deixis' is not defined — run the deixis cell (section 11) first; plotting zeros.")
summary = summary.sort_values(ascending=False)
fig, ax = plt.subplots()
summary.plot(kind="bar", ax=ax)
ax.set_title("Deixis/Indexicals (Total Counts)")
ax.set_ylabel("count")
plt.tight_layout()
plt.show()

# **14. (Optional) Simple KWIC — quick concordance**

In [None]:
def kwic(df, col, keyword, window=5, n=10):
    """Key Word In Context: return up to `n` concordance lines for `keyword`.

    Parameters
    ----------
    df : DataFrame holding the texts.
    col : name of the text column in `df`.
    keyword : word to look up; matched case-insensitively against whole tokens.
    window : number of tokens of context kept on each side of a hit.
    n : maximum number of hits to return; n <= 0 returns an empty list.

    Returns
    -------
    list of (left_context, keyword, right_context) tuples, all lowercased,
    in the order the hits appear in `df[col]`.
    """
    if n <= 0:
        # Without this guard the first hit would be appended before the
        # limit is checked, so n=0 would still return one line.
        return []

    word_re = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ']+")
    kw = keyword.lower().strip()
    out = []
    # dropna(): missing cells would otherwise be stringified to "None"/"nan"
    # and become searchable pseudo-words.
    for text in df[col].dropna().astype(str):
        toks = [w.lower() for w in word_re.findall(text)]
        for i, w in enumerate(toks):
            if w == kw:
                left = " ".join(toks[max(0, i-window):i])
                right = " ".join(toks[i+1:i+1+window])
                out.append((left, w, right))
                if len(out) >= n:
                    return out
    return out

# Show up to five concordance lines for "nosotros" with four words of context
# on each side. Uses TEXT_COL (section 5) rather than a second hard-coded
# column name, so the column only has to be changed in one place.
for left, w, right in kwic(df, TEXT_COL, keyword="nosotros", window=4, n=5):
    print(f"{left} [{w}] {right}")