## STIMULER ASSIGNMENT - NIHAR MITTAL


In [20]:
!pip install pandas numpy scikit-learn



## Approach 1: Creating a recommendation system for multiple users.

## Import Libraries and Dependencies

This block imports essential libraries for data handling, preprocessing, machine learning model training, and evaluation.

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict


## Generate Sample Data for Users

This section creates a simulated dataset for 10 users, with features such as country, age band, proficiency levels, errors, and scores in different language categories.

In [22]:
data = {
    'User_ID': [f'user_{i+1}' for i in range(10)],
    'Country': np.random.choice(['India', 'Japan', 'USA', 'Brazil', 'Germany'], 10),
    'Age_Band': np.random.choice(['Teen', 'Young Adult', 'Adult', 'Senior'], 10),
    'English_Proficiency_Level': np.random.choice(['Beginner', 'Intermediate', 'Advanced'], 10),
    'Grammar_Errors': np.random.randint(0, 10, 10),
    'Vocabulary_Errors': np.random.randint(0, 10, 10),
    'Pronunciation_Errors': np.random.randint(0, 10, 10),
    'Fluency_Errors': np.random.randint(0, 10, 10),
    'Grammar_Score': np.random.randint(50, 100, 10),
    'Vocabulary_Score': np.random.randint(50, 100, 10),
    'Pronunciation_Score': np.random.randint(50, 100, 10),
    'Fluency_Score': np.random.randint(50, 100, 10),
    'Exercises_Shown': np.random.choice(['Grammar', 'Vocabulary', 'Pronunciation', 'Fluency'], 10)
}
df = pd.DataFrame(data)


## Encoding Categorical Features

This block encodes categorical variables (Country, Age_Band, and English_Proficiency_Level) using LabelEncoder to convert them into numerical values suitable for machine learning algorithms.

In [23]:
label_enc = LabelEncoder()
df['Country'] = label_enc.fit_transform(df['Country'])
df['Age_Band'] = label_enc.fit_transform(df['Age_Band'])
df['English_Proficiency_Level'] = label_enc.fit_transform(df['English_Proficiency_Level'])


## Preprocessing Features (One-Hot Encoding and Scaling)

The OneHotEncoder is applied to categorical features, and numerical score features are scaled using StandardScaler. All preprocessed features are combined into a single feature matrix X.

In [24]:
onehot_enc = OneHotEncoder()
categorical_features = ['Country', 'Age_Band']
encoded_features = onehot_enc.fit_transform(df[categorical_features]).toarray()

scaler = StandardScaler()
score_features = ['Grammar_Score', 'Vocabulary_Score', 'Pronunciation_Score', 'Fluency_Score']
scaled_scores = scaler.fit_transform(df[score_features])

X = np.hstack([encoded_features, scaled_scores, df[['Grammar_Errors', 'Vocabulary_Errors', 'Pronunciation_Errors', 'Fluency_Errors']].values])


## Define Target Variable and Train-test Split for Training.

The target variable y is defined by identifying the language category with the most errors for each user (Grammar_Errors, Vocabulary_Errors, etc.).

This section splits the data into training and testing sets (70-30 split) using the train_test_split function.

In [25]:
df['max_error_category'] = df[['Grammar_Errors', 'Vocabulary_Errors', 'Pronunciation_Errors', 'Fluency_Errors']].idxmax(axis=1)
y = df['max_error_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)



The model is evaluated by making predictions on the test set. The accuracy_score and a detailed classification_report are printed to assess model performance. The results of this are not conclusive because the data is fake and very less. Also , we can change our algorithm based on the data we have.



In [26]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.0
Classification Report:
                       precision    recall  f1-score   support

      Fluency_Errors       0.00      0.00      0.00       0.0
      Grammar_Errors       0.00      0.00      0.00       2.0
Pronunciation_Errors       0.00      0.00      0.00       1.0
   Vocabulary_Errors       0.00      0.00      0.00       0.0

            accuracy                           0.00       3.0
           macro avg       0.00      0.00      0.00       3.0
        weighted avg       0.00      0.00      0.00       3.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


This function predicts the error category for a specific user by encoding their features and applying the trained RandomForest model.

In [27]:
def predict_error_category(user_data):
    encoded_user_data = np.hstack([onehot_enc.transform([[user_data['Country'], user_data['Age_Band']]]).toarray(),
                                   scaler.transform([[user_data['Grammar_Score'], user_data['Vocabulary_Score'],
                                                      user_data['Pronunciation_Score'], user_data['Fluency_Score']]]),
                                   np.array([[user_data['Grammar_Errors'], user_data['Vocabulary_Errors'],
                                              user_data['Pronunciation_Errors'], user_data['Fluency_Errors']]])])
    predicted_category = model.predict(encoded_user_data)[0]
    return predicted_category


## Cold Start: KMeans Clustering for New Users
KMeans clustering is applied to handle the cold start scenario for new users who lack historical data. It assigns new users to a cluster based on demographics and proficiency level.

In [28]:
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)

def cold_start_recommendation(user_data, onehot_enc, scaler, kmeans):
    encoded_user_data = onehot_enc.transform([[user_data['Country'], user_data['Age_Band']]]).toarray()
    scaled_user_scores = scaler.transform([[user_data['Grammar_Score'], user_data['Vocabulary_Score'],
                                            user_data['Pronunciation_Score'], user_data['Fluency_Score']]])

    transformed_user_data = np.hstack([encoded_user_data, scaled_user_scores,
                                       np.array([[user_data['Grammar_Errors'], user_data['Vocabulary_Errors'],
                                                  user_data['Pronunciation_Errors'], user_data['Fluency_Errors']]])])

    cluster = kmeans.predict(transformed_user_data)[0]

    if cluster == 0:
        return "Start with basic grammar exercises."
    elif cluster == 1:
        return "Start with fluency exercises for smoother speech."
    else:
        return "Start with pronunciation and vocabulary flashcards."


## Generate Personalized Exercises Based on Error Category

This function generates personalized exercises based on the error category and user demographics (e.g., using anime references for Japanese users).

In [29]:
def generate_exercise(user_data, error_category):
    country = user_data['Country']
    proficiency_level = user_data['English_Proficiency_Level']

    if error_category == 'Grammar_Errors':
        if country == 'Japan':
            return "Grammar exercise with Anime references."
        elif country == 'India':
            return "Grammar exercise with Friends sitcom references."
        else:
            return "Standard grammar fill-in-the-blanks exercise."

    elif error_category == 'Vocabulary_Errors':
        if proficiency_level == 'Beginner':
            return "Vocabulary flashcards for basic words."
        else:
            return "Advanced vocabulary in context-based questions."

    elif error_category == 'Pronunciation_Errors':
        return "Pronunciation exercise with phonetic transcription and audio."

    elif error_category == 'Fluency_Errors':
        return "Fluency exercise with speed reading and pause detection."

    return "General exercise for English practice."


## Main Recommendation Function

This function handles both new and existing users. For existing users, it predicts the most common error category based on historical data. For new users, it applies the cold start method using KMeans clustering.

In [30]:
def recommend_exercise(user_id, user_data, historical_data, onehot_enc, scaler, kmeans):
    if user_id in historical_data['User_ID'].values:
        user_historical_data = historical_data[historical_data['User_ID'] == user_id]
        predicted_category = predict_error_category(user_historical_data.iloc[0])
        return generate_exercise(user_data, predicted_category)
    else:
        return cold_start_recommendation(user_data, onehot_enc, scaler, kmeans)

user_data_example = {
    'Country': 0,  # Assuming the user is from 'India' (encoded)
    'Age_Band': 2,  # Assuming 'Young Adult'
    'English_Proficiency_Level': 0,  # Beginner
    'Grammar_Errors': 3,
    'Vocabulary_Errors': 2,
    'Pronunciation_Errors': 1,
    'Fluency_Errors': 4,
    'Grammar_Score': 65,
    'Vocabulary_Score': 75,
    'Pronunciation_Score': 85,
    'Fluency_Score': 60
}

recommendation = recommend_exercise('user_001', user_data_example, df, onehot_enc, scaler, kmeans)
print("Recommended Exercise:", recommendation)



Recommended Exercise: Start with pronunciation and vocabulary flashcards.




## Approach 2: Focussing More on Analysis of one single user.

## Import Required Libraries

This block imports the necessary libraries for handling the user data, performing operations on historical data, and managing feedback features such as grammar, vocabulary, pronunciation, and fluency.

In [31]:
import numpy as np
import pandas as pd
from collections import defaultdict


## Simulate Historical User Data

This block simulates historical data for a single user. The data contains details about the user's interactions with the language learning system, including feedback on grammar, vocabulary, pronunciation, and fluency.

In [32]:
historical_data = [
    {
        "user_id": "user123",
        "timestamp": "2024-10-15T08:00:00Z",
        "utterance": "I didn't watch any movie, bro.",
        "feedback": {
            "grammar": {
                "score": 0.95,
                "errors": [
                    {"type": "article usage", "error": "any -> a", "impact": 0.5}
                ]
            },
            "vocabulary": {
                "score": 8.5,
                "suggestions": [
                    {"incorrect_word": "bright", "correct_word": "promising"}
                ]
            },
            "pronunciation": {
                "score": 91,
                "incorrect_words": [
                    {"word": "important", "accuracy": 56}
                ]
            },
            "fluency": {
                "score": 71,
                "feedback": {
                    "speed": "slow",
                    "wpm": 119,
                    "syllables_per_minute": 217
                }
            }
        }
    }
]


## Data Preprocessing and Feature Extraction
This block extracts relevant features from the historical data. It computes error frequencies for each feedback category (grammar, vocabulary, etc.) and calculates the average scores for each category. The goal is to create structured data that summarizes user performance across all language aspects.

In [33]:
def extract_features(user_data):
    categories = ["grammar", "vocabulary", "pronunciation", "fluency"]
    category_scores = defaultdict(list)
    error_frequencies = defaultdict(lambda: defaultdict(int))

    for entry in user_data:
        for category in categories:
            feedback = entry["feedback"].get(category, {})
            score = feedback.get("score", None)
            if score is not None:
                category_scores[category].append(score)

            if category == "grammar" and "errors" in feedback:
                for error in feedback["errors"]:
                    error_frequencies["grammar"][error["type"]] += 1
            elif category == "vocabulary" and "suggestions" in feedback:
                for suggestion in feedback["suggestions"]:
                    error_frequencies["vocabulary"][suggestion["incorrect_word"]] += 1
            elif category == "pronunciation" and "incorrect_words" in feedback:
                for word in feedback["incorrect_words"]:
                    error_frequencies["pronunciation"][word["word"]] += 1

    aggregated_features = {
        "category_scores": {cat: np.mean(scores) for cat, scores in category_scores.items()},
        "error_frequencies": error_frequencies
    }

    return aggregated_features

user_features = extract_features(historical_data)
print("Extracted Features:\n", user_features)


Extracted Features:
 {'category_scores': {'grammar': 0.95, 'vocabulary': 8.5, 'pronunciation': 91.0, 'fluency': 71.0}, 'error_frequencies': defaultdict(<function extract_features.<locals>.<lambda> at 0x7f4c6c8e23b0>, {'grammar': defaultdict(<class 'int'>, {'article usage': 1}), 'vocabulary': defaultdict(<class 'int'>, {'bright': 1}), 'pronunciation': defaultdict(<class 'int'>, {'important': 1})})}


## Selecting the Exercise Category

This block selects the category (grammar, vocabulary, pronunciation, or fluency) that should be prioritized for the next exercise. The selection is based on error frequencies and proficiency scores. Categories with higher error counts and lower scores are given more weight.

In [34]:
def select_exercise_category(features):
    weights = {}

    for category, score in features["category_scores"].items():
        if category in features["error_frequencies"]:
            error_count = sum(features["error_frequencies"][category].values())
            weights[category] = (1 - score / 100) * error_count

    selected_category = max(weights, key=weights.get)

    return selected_category, weights

selected_category, weights = select_exercise_category(user_features)
print(f"Selected Exercise Category: {selected_category}, Weights: {weights}")


Selected Exercise Category: grammar, Weights: {'grammar': 0.9905, 'vocabulary': 0.915, 'pronunciation': 0.08999999999999997}


#Simulating User Demographics and Preferences
This block contains demographic information about the user, such as country, age group, and interests (e.g., anime or movies). These preferences will be used to customize the exercises to make them more engaging.

This data structure can be expanded and improved. Country , state and more specific location of the user , exact age , interests can be more specific , we can ask that if the person is a student , professional etc. , we can ask what was their native language to get more insights for future users.

In [35]:
user_demographics = {
    "country": "Japan",
    "age_group": "18-25",
    "interests": ["anime", "movies"]
}


## Generating a Personalized Exercise
This block generates a personalized exercise based on the selected category and the user's interests. For example, users interested in anime may receive exercises with anime references, while users interested in movies might receive exercises based on movie dialogues. The content of the exercises is also adjusted based on the user’s common errors.

In [36]:
def generate_exercise(selected_category, user_demographics, user_features):
    exercise_content = {}

    if selected_category == "grammar":
        most_common_error = max(user_features["error_frequencies"]["grammar"], key=user_features["error_frequencies"]["grammar"].get)
        exercise_content["type"] = "fill-in-the-blank"
        exercise_content["instruction"] = f"Correct the sentence with the proper {most_common_error}"
        exercise_content["example"] = f"He watched __ movie. (Hint: article usage)"

    elif selected_category == "pronunciation":
        most_common_word = max(user_features["error_frequencies"]["pronunciation"], key=user_features["error_frequencies"]["pronunciation"].get)
        exercise_content["type"] = "pronunciation"
        exercise_content["instruction"] = f"Practice pronouncing the word: {most_common_word}"
        exercise_content["audio_clip"] = f"audio_{most_common_word}.mp3"

    if "anime" in user_demographics["interests"]:
        exercise_content["context"] = "anime"
    elif "movies" in user_demographics["interests"]:
        exercise_content["context"] = "movies"

    return exercise_content

exercise = generate_exercise(selected_category, user_demographics, user_features)
print("Generated Exercise:\n", exercise)


Generated Exercise:
 {'type': 'fill-in-the-blank', 'instruction': 'Correct the sentence with the proper article usage', 'example': 'He watched __ movie. (Hint: article usage)', 'context': 'anime'}
