## Recommendation To Improve Language Learning

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dot, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Load the rating data from disk.
df = pd.read_csv('data.csv')

# Preprocessing: integer-encode every categorical column and both ID columns
# so they can be used as Embedding indices.
#
# Each column gets its OWN fitted LabelEncoder, stored in `encoders`. Re-fitting
# a single shared encoder would overwrite its classes_ on every column, making
# it impossible to translate encoded values back to the original labels (or to
# encode a new raw value consistently). Decode with, for example,
# encoders['exercise_id'].inverse_transform(encoded_ids).
categorical_columns = ['category', 'difficulty', 'cultural_context', 'user_country', 'user_age_group', 'user_proficiency']
encoders = {}
for col in categorical_columns + ['user_id', 'exercise_id']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on 'exercise_id' (the last column).

# Create a mapping from exercise_id to cultural_context.
# Keys are the raw (pre-encoding) exercise IDs. The recommendation helpers in
# this file look entries up as `encoded_id + 101`, which only lines up when the
# raw IDs present in the data are consecutive and start at 101.
exercise_to_context = {
    101: 'Anime',
    102: 'Kpop',
    103: 'Kdrama',
    104: 'Horror',
    105: 'Sitcom',
    106: 'Memes',
    107: 'Sports',
    108: 'Informative'
}

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(df, random_state=42, test_size=0.2)

# Model parameters
# Width of the user / exercise latent-factor embeddings.
n_factors = 50

# Number of distinct values in each encoded column (used as Embedding input sizes).
(
    n_users,
    n_exercises,
    n_categories,
    n_difficulties,
    n_cultural_contexts,
    n_countries,
    n_age_groups,
    n_proficiencies,
) = (
    df[column].nunique()
    for column in (
        'user_id',
        'exercise_id',
        'category',
        'difficulty',
        'cultural_context',
        'user_country',
        'user_age_group',
        'user_proficiency',
    )
)

# Model architecture
# Hybrid recommender: a collaborative-filtering branch (user x exercise dot
# product) and a content-based branch (MLP over feature embeddings), blended by
# a final Dense(1) layer into a single predicted rating.
#
# One scalar input (shape=(1,)) per feature; each is fed to an Embedding below,
# so it is expected to carry an integer index.
user_input = Input(shape=(1,), name='user_input')
exercise_input = Input(shape=(1,), name='exercise_input')
category_input = Input(shape=(1,), name='category_input')
difficulty_input = Input(shape=(1,), name='difficulty_input')
cultural_context_input = Input(shape=(1,), name='cultural_context_input')
user_country_input = Input(shape=(1,), name='user_country_input')
user_age_group_input = Input(shape=(1,), name='user_age_group_input')
user_proficiency_input = Input(shape=(1,), name='user_proficiency_input')

# Collaborative filtering part
# n_factors-wide latent vectors for users and exercises, with a small L2
# penalty (1e-6) on the embedding weights. Their dot product is the CF score.
user_embedding = Embedding(n_users, n_factors, embeddings_regularizer=l2(1e-6))(user_input)
exercise_embedding = Embedding(n_exercises, n_factors, embeddings_regularizer=l2(1e-6))(exercise_input)
# Flatten drops the length-1 sequence axis: (batch, 1, n_factors) -> (batch, n_factors).
user_vec = Flatten()(user_embedding)
exercise_vec = Flatten()(exercise_embedding)
cf_output = Dot(axes=1)([user_vec, exercise_vec])

# Content-based part
# One embedding per categorical feature (widths 10/5/10/10/5/5).
category_embedding = Embedding(n_categories, 10)(category_input)
difficulty_embedding = Embedding(n_difficulties, 5)(difficulty_input)
cultural_context_embedding = Embedding(n_cultural_contexts, 10)(cultural_context_input)
user_country_embedding = Embedding(n_countries, 10)(user_country_input)
user_age_group_embedding = Embedding(n_age_groups, 5)(user_age_group_input)
user_proficiency_embedding = Embedding(n_proficiencies, 5)(user_proficiency_input)

# Flatten each embedding and join them into one 45-wide feature vector
# (10 + 5 + 10 + 10 + 5 + 5).
concat = Concatenate()([
    Flatten()(category_embedding),
    Flatten()(difficulty_embedding),
    Flatten()(cultural_context_embedding),
    Flatten()(user_country_embedding),
    Flatten()(user_age_group_embedding),
    Flatten()(user_proficiency_embedding)
])

# More complex ANN architecture
# MLP tower 64 -> 32 -> 16 -> 8 with BatchNormalization after every Dense and a
# single Dropout(0.3) after the second block; ends in a linear Dense(1).
dense1 = Dense(64, activation='relu')(concat)
bn1 = BatchNormalization()(dense1)
dense2 = Dense(32, activation='relu')(bn1)
bn2 = BatchNormalization()(dense2)
dropout1 = Dropout(0.3)(bn2)
dense3 = Dense(16, activation='relu')(dropout1)
bn3 = BatchNormalization()(dense3)
dense4 = Dense(8, activation='relu')(bn3)
bn4 = BatchNormalization()(dense4)
cb_output = Dense(1)(bn4)

# Combine CF and CB outputs
# A linear Dense(1) over [cf_output, cb_output] learns how to weight the two scores.
combined_output = Dense(1)(Concatenate()([cf_output, cb_output]))

# The order of `inputs` here fixes the order in which arrays must be passed to
# model.fit / model.predict.
model = Model(
    inputs=[user_input, exercise_input, category_input, difficulty_input, cultural_context_input, user_country_input, user_age_group_input, user_proficiency_input],
    outputs=combined_output
)

# Regression on the rating: mean squared error loss, Adam with learning rate 0.001.
model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))

# Early stopping to prevent overfitting
# Stops once val_loss has not improved for 5 epochs and restores the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training the model
# Feature columns are passed in the same order as the Model inputs above;
# validation_split=0.1 holds out 10% of the training rows for val_loss.
history = model.fit(
    [train['user_id'], train['exercise_id'], train['category'], train['difficulty'], train['cultural_context'], train['user_country'], train['user_age_group'], train['user_proficiency']],
    train['rating'],
    epochs=200,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1
)

# Function to get top N recommendations for a user
def get_top_n_recommendations(user_id, n=2):
    """Return labels for the ``n`` best-scoring exercises the user has not rated.

    Every exercise in ``df`` that has no row for ``user_id`` is scored with the
    trained ``model``. The highest predictions are returned, best first.

    Args:
        user_id: Encoded user ID, as stored in ``df['user_id']``.
        n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``n`` cultural-context labels. Each label is looked
        up as ``exercise_to_context[encoded_exercise_id + 101]``, with
        ``'Unknown'`` when the key is missing. This lookup assumes the raw
        exercise IDs are consecutive and start at 101. The list is empty when
        ``n`` is not positive, when the user has no rows in ``df`` (see
        ``cold_start_recommendation`` for new users), or when the user has
        already rated every exercise.
    """
    if n <= 0:
        # Guard: a slice of [-0:] would select *every* candidate, not none.
        return []

    user_rows = df[df['user_id'] == user_id]
    if user_rows.empty:
        # Unknown user: there is no row to read the user-side features from.
        return []

    exercises_to_predict = np.setdiff1d(
        df['exercise_id'].unique(),
        user_rows['exercise_id'].unique(),
    )
    n_candidates = len(exercises_to_predict)
    if n_candidates == 0:
        return []

    # User-side features are identical for every candidate.
    user_data = user_rows.iloc[0]
    user_vector = np.full(n_candidates, user_id)
    user_country_vector = np.full(n_candidates, user_data['user_country'])
    user_age_group_vector = np.full(n_candidates, user_data['user_age_group'])
    user_proficiency_vector = np.full(n_candidates, user_data['user_proficiency'])

    # Exercise-side features must describe each CANDIDATE exercise. Reusing the
    # values from one of the user's past rows would feed the content-based
    # branch the same input for every candidate, so it could not influence the
    # ranking. Use the most common value observed for each exercise.
    item_columns = ['category', 'difficulty', 'cultural_context']
    item_features = (
        df.groupby('exercise_id')[item_columns]
        .agg(lambda values: values.mode().iloc[0])
        .loc[exercises_to_predict]
    )
    category_vector = item_features['category'].to_numpy()
    difficulty_vector = item_features['difficulty'].to_numpy()
    cultural_context_vector = item_features['cultural_context'].to_numpy()

    predictions = model.predict([
        user_vector,
        exercises_to_predict,
        category_vector,
        difficulty_vector,
        cultural_context_vector,
        user_country_vector,
        user_age_group_vector,
        user_proficiency_vector,
    ])

    # argsort is ascending: take the last n positions and reverse for best-first.
    top_n_indices = predictions.flatten().argsort()[-n:][::-1]
    recommended_exercises = exercises_to_predict[top_n_indices]
    return [exercise_to_context.get(ex + 101, 'Unknown') for ex in recommended_exercises]

# Cold start strategy
def cold_start_recommendation(user_data, n=2):
    """Recommend exercises for a user who has no rating history.

    Exercises are ranked by mean rating among existing users who share the new
    user's proficiency and country. When no such users exist, the ranking
    falls back to the mean rating over all users.

    Args:
        user_data: Mapping with ``'user_proficiency'`` and ``'user_country'``
            keys, holding values comparable to the matching ``df`` columns.
        n: Number of exercises to return.

    Returns:
        A list of cultural-context labels, looked up as
        ``exercise_to_context[exercise_id + 101]`` with ``'Unknown'`` as the
        fallback, best-rated first.
    """
    same_proficiency = df['user_proficiency'] == user_data['user_proficiency']
    same_country = df['user_country'] == user_data['user_country']
    cohort = df[same_proficiency & same_country]

    # With no look-alike users, rank by popularity across the whole data set.
    ratings_source = df if len(cohort) == 0 else cohort

    mean_ratings = ratings_source.groupby('exercise_id')['rating'].mean()
    best_first = mean_ratings.sort_values(ascending=False)
    top_ids = best_first.head(n).index.tolist()

    return [exercise_to_context.get(ex + 101, 'Unknown') for ex in top_ids]

# Demo 1: an existing user. Encoded user IDs are assumed to start at 0.
user_id = 0
top_recommendations = get_top_n_recommendations(user_id, n=2)
print(f"Top exercise recommendations for user {user_id}: {top_recommendations}")

# Demo 2: a brand-new user, described by the most common proficiency and
# country found in the data set.
new_user_data = {
    field: df[field].mode().iloc[0]
    for field in ('user_proficiency', 'user_country')
}
cold_start_recs = cold_start_recommendation(new_user_data)
print(f"Cold start recommendations: {cold_start_recs}")

Epoch 1/200




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 13.8610 - val_loss: 16.9906
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 13.7922 - val_loss: 16.9719
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 13.8032 - val_loss: 16.9526
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 13.7696 - val_loss: 16.9335
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 13.5934 - val_loss: 16.9148
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 13.5705 - val_loss: 16.8923
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 13.5711 - val_loss: 16.8730
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - loss: 13.4686 - val_loss: 16.8501
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

### Evaluation

In [2]:
# Pull every model input out of the held-out frame as a plain NumPy array,
# in the same order as the model's inputs.
(
    user_id_test,
    exercise_id_test,
    category_test,
    difficulty_test,
    cultural_context_test,
    user_country_test,
    user_age_group_test,
    user_proficiency_test,
) = (
    np.array(test[column])
    for column in (
        'user_id',
        'exercise_id',
        'category',
        'difficulty',
        'cultural_context',
        'user_country',
        'user_age_group',
        'user_proficiency',
    )
)

# Score the held-out rows with the trained model
test_inputs = [
    user_id_test,
    exercise_id_test,
    category_test,
    difficulty_test,
    cultural_context_test,
    user_country_test,
    user_age_group_test,
    user_proficiency_test,
]
test_predictions = model.predict(test_inputs)

# Error metrics: mean squared error and its square root
mse = mean_squared_error(test['rating'], test_predictions)
rmse = np.sqrt(mse)

print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step
Test MSE: 0.6777856046786497
Test RMSE: 0.82327735586414


In [3]:
import matplotlib.pyplot as plt

# Overlay the per-epoch training and validation loss recorded in `history`
for metric_key, curve_label in (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
):
    plt.plot(history.history[metric_key], label=curve_label)

plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Load the data
# Re-reads the CSV, replacing any previously loaded/encoded `df`.
df = pd.read_csv('data.csv')
# Bare expression: displays the frame as the cell output.
df

Unnamed: 0,user_id,exercise_id,category,difficulty,cultural_context,user_country,user_age_group,user_proficiency,rating
0,1,101,Grammar,Easy,Anime,Japan,18-25,Beginner,4
1,1,102,Vocabulary,Medium,Kpop,Japan,18-25,Beginner,3
2,1,103,Pronunciation,Hard,Kdrama,Japan,18-25,Beginner,2
3,2,104,Grammar,Easy,Horror,India,26-35,Intermediate,5
4,2,101,Vocabulary,Medium,Anime,India,26-35,Intermediate,4
5,2,102,Pronunciation,Hard,Kpop,India,26-35,Intermediate,3
6,3,103,Grammar,Easy,Kdrama,USA,36-45,Advanced,5
7,3,104,Vocabulary,Medium,Horror,USA,36-45,Advanced,4
8,3,101,Pronunciation,Hard,Anime,USA,36-45,Advanced,3
9,4,102,Grammar,Easy,Kpop,South Korea,18-25,Beginner,4


In [None]:
# Encode categorical columns as integers so they can index Embedding layers.
# Each column gets its own fitted LabelEncoder, kept in `encoders`. Re-fitting
# one shared encoder would overwrite its classes_ on every column, so encoded
# values could not be decoded (or new raw values encoded) afterwards.
# Decode with encoders[col].inverse_transform(values).
categorical_columns = ['category', 'difficulty', 'cultural_context', 'user_country', 'user_age_group', 'user_proficiency']
encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [None]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle seed.
train, test = train_test_split(df, test_size=0.2, random_state=42)
# Bare expression: displays the training rows as the cell output.
train


Unnamed: 0,user_id,exercise_id,category,difficulty,cultural_context,user_country,user_age_group,user_proficiency,rating
5,2,102,Pronunciation,Hard,Kpop,India,26-35,Intermediate,3
11,4,104,Pronunciation,Hard,Horror,South Korea,18-25,Beginner,2
3,2,104,Grammar,Easy,Horror,India,26-35,Intermediate,5
18,7,103,Grammar,Easy,Kdrama,India,18-25,Beginner,4
16,6,101,Vocabulary,Medium,Anime,Japan,36-45,Advanced,5
13,5,102,Vocabulary,Medium,Kpop,China,26-35,Intermediate,3
2,1,103,Pronunciation,Hard,Kdrama,Japan,18-25,Beginner,2
9,4,102,Grammar,Easy,Kpop,South Korea,18-25,Beginner,4
20,7,101,Pronunciation,Hard,Anime,India,18-25,Beginner,2
4,2,101,Vocabulary,Medium,Anime,India,26-35,Intermediate,4


In [None]:
# Bare expression: displays the held-out test rows as the cell output.
test

Unnamed: 0,user_id,exercise_id,category,difficulty,cultural_context,user_country,user_age_group,user_proficiency,rating
0,1,101,Grammar,Easy,Anime,Japan,18-25,Beginner,4
17,6,102,Pronunciation,Hard,Kpop,Japan,36-45,Advanced,3
15,6,104,Grammar,Easy,Horror,Japan,36-45,Advanced,4
1,1,102,Vocabulary,Medium,Kpop,Japan,18-25,Beginner,3
8,3,101,Pronunciation,Hard,Anime,USA,36-45,Advanced,3


In [None]:
# Define model parameters: the number of distinct values in each encoded
# feature column, used below as the Embedding input sizes.
(
    n_categories,
    n_difficulties,
    n_cultural_contexts,
    n_countries,
    n_age_groups,
    n_proficiencies,
) = (
    df[column].nunique()
    for column in (
        'category',
        'difficulty',
        'cultural_context',
        'user_country',
        'user_age_group',
        'user_proficiency',
    )
)

print(f"Number of categories: {n_categories}")
print(f"Number of difficulties: {n_difficulties}")
print(f"Number of cultural contexts: {n_cultural_contexts}")
print(f"Number of countries: {n_countries}")
print(f"Number of age groups: {n_age_groups}")
print(f"Number of proficiencies: {n_proficiencies}")


Number of categories: 3
Number of difficulties: 3
Number of cultural contexts: 4
Number of countries: 5
Number of age groups: 3
Number of proficiencies: 3


In [None]:
# Content-based model architecture
# One scalar (shape=(1,)) Keras input per feature, created in a fixed order.
(
    category_input,
    difficulty_input,
    cultural_context_input,
    user_country_input,
    user_age_group_input,
    user_proficiency_input,
) = (
    Input(shape=(1,), name=input_name)
    for input_name in (
        'category_input',
        'difficulty_input',
        'cultural_context_input',
        'user_country_input',
        'user_age_group_input',
        'user_proficiency_input',
    )
)

# Show the symbolic tensors to confirm their shapes and names
for input_tensor in (
    category_input,
    difficulty_input,
    cultural_context_input,
    user_country_input,
    user_age_group_input,
    user_proficiency_input,
):
    print(input_tensor)

<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=category_input>
<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=difficulty_input>
<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=cultural_context_input>
<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=user_country_input>
<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=user_age_group_input>
<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=user_proficiency_input>


In [None]:
# Embedding layers
# (input tensor, number of distinct values, embedding width) for each feature,
# listed in the order the layers are created.
embedding_plan = (
    (category_input, n_categories, 10),
    (difficulty_input, n_difficulties, 5),
    (cultural_context_input, n_cultural_contexts, 10),
    (user_country_input, n_countries, 10),
    (user_age_group_input, n_age_groups, 5),
    (user_proficiency_input, n_proficiencies, 5),
)
(
    category_embedding,
    difficulty_embedding,
    cultural_context_embedding,
    user_country_embedding,
    user_age_group_embedding,
    user_proficiency_embedding,
) = (
    Embedding(vocabulary_size, width)(feature_input)
    for feature_input, vocabulary_size, width in embedding_plan
)

# Show the embedded tensors to confirm their shapes
for embedded in (
    category_embedding,
    difficulty_embedding,
    cultural_context_embedding,
    user_country_embedding,
    user_age_group_embedding,
    user_proficiency_embedding,
):
    print(embedded)


<KerasTensor shape=(None, 1, 10), dtype=float32, sparse=False, name=keras_tensor_45>
<KerasTensor shape=(None, 1, 5), dtype=float32, sparse=False, name=keras_tensor_46>
<KerasTensor shape=(None, 1, 10), dtype=float32, sparse=False, name=keras_tensor_47>
<KerasTensor shape=(None, 1, 10), dtype=float32, sparse=False, name=keras_tensor_48>
<KerasTensor shape=(None, 1, 5), dtype=float32, sparse=False, name=keras_tensor_49>
<KerasTensor shape=(None, 1, 5), dtype=float32, sparse=False, name=keras_tensor_50>


In [None]:
# Concatenate all embeddings
# The merge layer is instantiated first, then each embedding is flattened in
# feature order, and finally the flattened vectors are joined into one tensor.
merge_layer = Concatenate()
flattened_embeddings = [
    Flatten()(embedded)
    for embedded in (
        category_embedding,
        difficulty_embedding,
        cultural_context_embedding,
        user_country_embedding,
        user_age_group_embedding,
        user_proficiency_embedding,
    )
]
concat = merge_layer(flattened_embeddings)

In [None]:

# Fully connected layers
# MLP tower 64 -> 32 -> 16 -> 8 over the concatenated embeddings, with
# BatchNormalization after every Dense and one Dropout(0.3) after the second block.
dense1 = Dense(64, activation='relu')(concat)
bn1 = BatchNormalization()(dense1)
dense2 = Dense(32, activation='relu')(bn1)
bn2 = BatchNormalization()(dense2)
dropout1 = Dropout(0.3)(bn2)
dense3 = Dense(16, activation='relu')(dropout1)
bn3 = BatchNormalization()(dense3)
dense4 = Dense(8, activation='relu')(bn3)
bn4 = BatchNormalization()(dense4)
output = Dense(1)(bn4)  # Output layer for rating prediction (linear, no activation)

# Build and compile the model
# The order of `inputs` fixes the order in which arrays must be passed to
# model.fit / model.predict.
model = Model(
    inputs=[category_input, difficulty_input, cultural_context_input, user_country_input, user_age_group_input, user_proficiency_input],
    outputs=output
)
# Regression on the rating: mean squared error loss, Adam with learning rate 0.001.
model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))

# Early stopping to prevent overfitting
# Stops once val_loss has not improved for 5 epochs and restores the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
# Feature columns are passed in the same order as the Model inputs;
# validation_split=0.1 holds out 10% of the training rows for val_loss.
history = model.fit(
    [train['category'], train['difficulty'], train['cultural_context'], train['user_country'], train['user_age_group'], train['user_proficiency']],
    train['rating'],
    epochs=200,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1
)

# Example: Predict ratings for a user with certain characteristics
def predict_rating(user_features):
    """Predict a single rating from one set of encoded feature values.

    Args:
        user_features: Mapping with the keys ``'category'``, ``'difficulty'``,
            ``'cultural_context'``, ``'user_country'``, ``'user_age_group'``
            and ``'user_proficiency'``. Each value is wrapped in a one-element
            array and passed to ``model`` in that order.

    Returns:
        The scalar at position ``[0][0]`` of the model's prediction output.
    """
    feature_order = (
        'category',
        'difficulty',
        'cultural_context',
        'user_country',
        'user_age_group',
        'user_proficiency',
    )
    # A batch of size one per input, matching the model's input order.
    model_inputs = [np.array([user_features[key]]) for key in feature_order]
    prediction = model.predict(model_inputs)
    return prediction[0][0]

# Example usage: score the "most typical" row, i.e. the most frequent encoded
# value of every feature column.
user_features = {
    column: df[column].mode().iloc[0]
    for column in (
        'category',
        'difficulty',
        'cultural_context',
        'user_country',
        'user_age_group',
        'user_proficiency',
    )
}
predicted_rating = predict_rating(user_features)
print(f"Predicted rating: {predicted_rating}")


Epoch 1/200




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 15.5238 - val_loss: 16.8832
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - loss: 13.8238 - val_loss: 16.8643
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - loss: 14.1466 - val_loss: 16.8155
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 13.4323 - val_loss: 16.7993
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - loss: 13.0523 - val_loss: 16.7678
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 13.7099 - val_loss: 16.7582
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 13.0272 - val_loss: 16.7548
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 13.2966 - val_loss: 16.7615
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step
Predicted rating: 0.879662275314331
