In [None]:
# # AI Model for Detecting Python Learning Preferences
# 
# This notebook trains a Random Forest classifier to detect whether users are looking to learn Python based on their queries and code examples.


In [None]:
# ## 1. Import Required Libraries


In [None]:
import json
import re
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob

warnings.filterwarnings('ignore')

# Set style for plots
# NOTE(review): the 'seaborn-v0_8-*' style names are only registered by some
# matplotlib releases - confirm the environment's version before relying on it.
plt.style.use('seaborn-v0_8-darkgrid')
# Applied after the style sheet so the palette is not overridden by it.
sns.set_palette("husl")


In [None]:
# ## 2. Load and Explore the Dataset


In [None]:
# Load the dataset
df = pd.read_csv('DataSet/student+performance/student/Python codes.csv')

# Fail early, with a clear message, if the columns every later cell reads
# ('question' for text features, 'code' for code features) are missing.
missing_columns = {'question', 'code'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {sorted(missing_columns)}")

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

print("\nMissing Values:")
print(df.isnull().sum())


In [None]:
# ## 3. Data Preprocessing


In [None]:
# Create a copy for preprocessing
# All engineered columns are added to this copy; the frame `df` read from the
# CSV is not modified by the cells below.
df_processed = df.copy()

# Feature engineering: Create features based on question content
def extract_features(text):
    """Build the hand-crafted question features used by the feature model.

    The input is coerced with ``str()`` and lower-cased first, so non-string
    values (``None``, NaN, numbers) are analysed as their string form.

    Every ``contains_*`` flag is a plain substring test, not a word match:
    ``'for'`` also fires on "information" and ``'oop'`` also fires on "loop".

    Parameters:
    text: Question text (any object; converted with ``str``).

    Returns:
    dict: Feature name -> int. The insertion order of the keys is fixed and
    defines the column order of the training matrix.
    """
    lowered = str(text).lower()

    def flag(*needles):
        """Return 1 if any needle occurs as a substring of the text, else 0."""
        return int(any(needle in lowered for needle in needles))

    return {
        # Length features
        'text_length': len(lowered),
        'word_count': len(lowered.split()),
        # Content features
        'contains_python': flag('python'),
        'contains_learn': flag('learn', 'study', 'teach', 'tutorial'),
        'contains_code': flag('code', 'program', 'function', 'script'),
        'contains_example': flag('example', 'sample', 'demonstrate'),
        'contains_question': flag('?'),
        # Programming concepts
        'contains_loop': flag('loop', 'for', 'while', 'iterate'),
        'contains_function': flag('function', 'def ', 'method'),
        'contains_class': flag('class', 'object', 'oop'),
        'contains_array': flag('array', 'list', '[]'),
        'contains_string': flag('string', 'str ', 'text'),
        # Difficulty indicators
        'contains_basic': flag('basic', 'simple', 'easy', 'beginner'),
        'contains_advanced': flag('advanced', 'complex', 'difficult', 'expert'),
    }

# Extract features from questions
# Build all feature rows in one pass and attach them as columns at once.
# Writing cell-by-cell with .loc inside iterrows() is very slow on a dataset of
# this size and leaves the flags stored as floats; this yields the same values
# as integer columns.
question_features = pd.DataFrame(
    [extract_features(question) for question in df_processed['question']],
    index=df_processed.index,
)

# Column order follows the key order of the dict returned by extract_features.
feature_columns = list(question_features.columns)

# assign() overwrites columns of the same name, so re-running this cell is safe.
df_processed = df_processed.assign(**question_features)

# Analyze code complexity
def analyze_code_complexity(code):
    """Derive simple size and structure features from a code snippet.

    The input is coerced with ``str()``; the keyword flags are case-insensitive
    substring tests on the lower-cased text, while the length and line count
    are taken from the text as given.

    Parameters:
    code: Code snippet (any object; converted with ``str``).

    Returns:
    dict: Feature name -> int, with keys in a fixed insertion order.
    """
    source = str(code)
    lowered = source.lower()

    def mentions(*tokens):
        """Return 1 if any token occurs as a substring of the code, else 0."""
        return int(any(token in lowered for token in tokens))

    return {
        'code_length': len(source),
        # Number of newline characters plus one, so an empty snippet counts as 1.
        'code_lines': source.count('\n') + 1,
        'has_function': mentions('def '),
        'has_class': mentions('class '),
        'has_import': mentions('import '),
        'has_loop': mentions('for ', 'while ', 'range('),
        'has_conditional': mentions('if ', 'elif ', 'else:', 'switch', 'case'),
    }

# Extract code features
# Same approach as for the question features: compute every row once, then
# attach the columns in a single step instead of writing cell-by-cell inside
# iterrows(). The values are unchanged; they are stored as integers.
code_features_frame = pd.DataFrame(
    [analyze_code_complexity(code) for code in df_processed['code']],
    index=df_processed.index,
).add_prefix('code_')  # e.g. 'has_loop' -> 'code_has_loop', 'code_length' -> 'code_code_length'

code_feature_columns = list(code_features_frame.columns)
df_processed = df_processed.assign(**code_features_frame)

# Combine all feature columns
all_feature_columns = feature_columns + code_feature_columns

print(f"Total features created: {len(all_feature_columns)}")
print("Feature columns:", all_feature_columns)


In [None]:
# ## 4. Create Target Variable (Learning Preference)


In [None]:
# Define rules for labeling Python learning preferences
def is_python_learning_query(text, code):
    """Rule-based labeller: 1 if the question/code pair looks like learning, else 0.

    Both inputs are coerced with ``str()`` and lower-cased; all checks are
    substring tests. The rules are tried in order and the first hit wins:

    1. the question contains a learning-intent phrase;
    2. the code contains a teaching marker comment or a ``print(`` call;
    3. the code is shorter than 500 characters and uses a basic construct.

    Parameters:
    text: Question text.
    code: Associated code snippet.

    Returns:
    int: 1 for a learning query, 0 otherwise.
    """
    question = str(text).lower()
    snippet = str(code).lower()

    # Rule 1: phrases indicating learning intent in the question
    intent_phrases = (
        'learn python', 'python tutorial', 'python example',
        'how to', 'what is', 'explain', 'understand',
        'beginner', 'starting', 'getting started',
        'teach me', 'show me', 'demonstrate',
    )
    if any(phrase in question for phrase in intent_phrases):
        return 1

    # Rule 2: educational patterns in the code
    teaching_markers = ('# example', '# sample', '# tutorial', 'print(')
    if any(marker in snippet for marker in teaching_markers):
        return 1

    # Rule 3: basic constructs only count for short snippets, which are more
    # likely to be learning examples
    starter_constructs = ('for i in range', 'def ', 'if __name__', 'import ')
    is_short = len(snippet) < 500
    if is_short and any(construct in snippet for construct in starter_constructs):
        return 1

    return 0

# Apply labeling
# One label per row, computed from that row's question and code.
df_processed['is_learning_python'] = [
    is_python_learning_query(question, code)
    for question, code in zip(df_processed['question'], df_processed['code'])
]

# Analyze label distribution
labels = df_processed['is_learning_python']
total_count = len(df_processed)
learning_count = labels.sum()
learning_percentage = (learning_count / total_count) * 100

print(f"Learning Python examples: {learning_count}/{total_count} ({learning_percentage:.2f}%)")
print("\nLabel Distribution:")
print(labels.value_counts())


In [None]:
# ## 5. Feature Analysis and Visualization


In [None]:
# Visualize feature distributions
# Select key features for visualization (one per panel of the 3x3 grid)
key_features = [
    'contains_python', 'contains_learn', 'contains_code',
    'contains_example', 'contains_loop', 'contains_function',
    'contains_basic', 'code_has_function', 'code_has_loop'
]

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

# Per-class mean of every selected flag, computed once for all panels
class_means = df_processed.groupby('is_learning_python')[key_features].mean()

# zip() stops at the shorter sequence, so at most nine panels are drawn
for ax, feature in zip(axes, key_features):
    class_means[feature].plot(kind='bar', ax=ax)
    ax.set(
        title=f'{feature} by Learning Preference',
        xlabel='Is Learning Python',
        ylabel='Average Value',
    )
    ax.set_xticklabels(['No', 'Yes'], rotation=0)

plt.tight_layout()
plt.show()


In [None]:
# ## 6. Prepare Data for Machine Learning


In [None]:
# Prepare features and target
y = df_processed['is_learning_python']
X = df_processed[all_feature_columns]

# Split the data: 80/20, stratified on the label so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")
print(f"\nTraining class distribution:\n{y_train.value_counts(normalize=True)}")
print(f"\nTesting class distribution:\n{y_test.value_counts(normalize=True)}")


In [None]:
# ## 7. Train Random Forest Classifier


In [None]:
# Initialize and train the model
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    class_weight='balanced'
).fit(X_train, y_train)

# Make predictions: hard labels plus the probability of the second class
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print(f"\nClassification Report:\n{report_text}")


In [None]:
# ## 8. Model Evaluation and Visualization


In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=rf_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Not Learning', 'Learning'])
disp.plot(cmap='Blues')
disp.ax_.set_title('Confusion Matrix')
plt.show()

# Feature Importance, most important first
feature_importance = pd.DataFrame({
    'feature': all_feature_columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

# Plot feature importance
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance.head(15), x='importance', y='feature', ax=ax)
ax.set_title('Top 15 Feature Importances')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

# Learning Curve: 5-fold cross-validation at ten training-set sizes
train_sizes, train_scores, test_scores = learning_curve(
    rf_model, X, y, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10)
)

# Average over the folds (axis 1) to get one score per training size
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_sizes, train_scores_mean, label='Training score')
ax.plot(train_sizes, test_scores_mean, label='Cross-validation score')
ax.set_xlabel('Training Size')
ax.set_ylabel('Score')
ax.set_title('Learning Curve')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# ## 9. Create Text-Based Features and Alternative Model


In [None]:
# Create text-based features using TF-IDF
vectorizer = TfidfVectorizer(
    max_features=50,
    stop_words='english',
    ngram_range=(1, 2)
)

# Combine question and code for text analysis
text_data = df_processed['question'].astype(str) + " " + df_processed['code'].astype(str)

# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on all rows
# would let the test documents influence the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported accuracy.
text_train, text_test, y_text_train, y_text_test = train_test_split(
    text_data, y, test_size=0.2, random_state=42, stratify=y
)

# Learn vocabulary and IDF from the training documents only; the test documents
# are merely projected onto that vocabulary.
X_text_train = vectorizer.fit_transform(text_train)
X_text_test = vectorizer.transform(text_test)

# Full matrix under its original name, built with the train-fitted vectorizer.
X_text = vectorizer.transform(text_data)

# Train text-based model
rf_text_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced'
)

rf_text_model.fit(X_text_train, y_text_train)

# Evaluate text model
y_text_pred = rf_text_model.predict(X_text_test)
text_accuracy = accuracy_score(y_text_test, y_text_pred)

print(f"Text Model Accuracy: {text_accuracy:.4f}")
print(f"\nText Model Classification Report:\n{classification_report(y_text_test, y_text_pred)}")


In [None]:
# ## 10. Save the Models and Vectorizer


In [None]:
# Persist the three fitted objects, in this order: the main model, the
# text-based model and the vectorizer.
artifacts = {
    'python_learning_detector_rf.pkl': rf_model,
    'python_learning_detector_text_rf.pkl': rf_text_model,
    'tfidf_vectorizer.pkl': vectorizer,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# Save feature names, one per line, in training column order
with open('feature_names.txt', 'w') as f:
    f.writelines(f"{feature}\n" for feature in all_feature_columns)

print("Models and artifacts saved successfully!")


In [None]:
# ## 11. Create Prediction Function


In [None]:
def predict_python_learning(query, code_snippet=None):
    """
    Predict if a query indicates Python learning preference

    The feature-based model and the text-based model are each asked for the
    probability of the positive class; the two are blended 60/40 and the
    blend is thresholded at 0.5.

    Uses the notebook globals ``all_feature_columns``, ``rf_model``,
    ``rf_text_model`` and ``vectorizer``.

    Parameters:
    query (str): User's question or query (``None`` is treated as "")
    code_snippet (str): Associated code (optional)

    Returns:
    dict: Prediction results with confidence
    """
    code_prefix = 'code_'

    query = "" if query is None else str(query)
    code_snippet = "" if code_snippet is None else str(code_snippet)

    # Extract features from query
    features = extract_features(query)
    code_features = analyze_code_complexity(code_snippet)

    # Create feature vector in training column order.
    # Training columns for code features are 'code_' + <analyzer key>, and some
    # analyzer keys already start with 'code_' (e.g. 'code_code_length'), so
    # exactly ONE leading prefix must be removed. str.replace() would strip
    # every occurrence and look up a key that does not exist.
    feature_vector = []
    for col in all_feature_columns:
        if col in features:
            feature_vector.append(features[col])
        elif col.startswith(code_prefix):
            feature_vector.append(code_features.get(col[len(code_prefix):], 0))
        else:
            feature_vector.append(0)

    # Probability of class index 1 from the feature-based model
    probability = rf_model.predict_proba([feature_vector])[0][1]

    # Text-based prediction on the same "question + space + code" string shape
    # used when the vectorizer was fitted
    text_vector = vectorizer.transform([query + " " + code_snippet])
    text_probability = rf_text_model.predict_proba(text_vector)[0][1]

    # Combine predictions (weighted average)
    combined_probability = (probability * 0.6) + (text_probability * 0.4)
    final_prediction = 1 if combined_probability > 0.5 else 0

    # Determine confidence level
    if combined_probability > 0.8:
        confidence = "High"
    elif combined_probability > 0.6:
        confidence = "Medium"
    else:
        confidence = "Low"

    # Extract key learning indicators
    indicator_labels = (
        ('contains_learn', "Contains learning keywords"),
        ('contains_python', "Mentions Python"),
        ('contains_basic', "Beginners/ Basic level"),
        ('contains_example', "Seeks examples"),
    )
    learning_indicators = [
        label for key, label in indicator_labels if features.get(key, 0)
    ]

    return {
        'prediction': final_prediction,
        'prediction_label': 'Learning Python' if final_prediction == 1 else 'Not Learning Python',
        'confidence': confidence,
        'probability': float(combined_probability),
        'feature_based_probability': float(probability),
        'text_based_probability': float(text_probability),
        'learning_indicators': learning_indicators,
        'key_features': {k: v for k, v in features.items() if v == 1}
    }


In [None]:
# ## 12. Test the Model with Examples


In [None]:
# NOTE(review): this cell re-defines predict_python_learning and therefore
# shadows the definition from the previous section; keep only one of the two.
def predict_python_learning(query, code_snippet=None):
    """
    Predict if a query indicates Python learning preference

    The feature-based model and the text-based model are each asked for the
    probability of the positive class; the two are blended 60/40 and the
    blend is thresholded at 0.5.

    Uses the notebook globals ``all_feature_columns``, ``rf_model``,
    ``rf_text_model`` and ``vectorizer``.

    Parameters:
    query (str): User's question or query (``None`` is treated as "")
    code_snippet (str): Associated code (optional)

    Returns:
    dict: Prediction results with confidence
    """
    code_prefix = 'code_'

    query = "" if query is None else str(query)
    code_snippet = "" if code_snippet is None else str(code_snippet)

    # Extract features from query
    features = extract_features(query)
    code_features = analyze_code_complexity(code_snippet)

    # Create feature vector in correct order.
    # Only the single leading 'code_' prefix is removed: for the training
    # column 'code_code_length' the analyzer key is 'code_length'. Using
    # str.replace() strips both occurrences, misses the key and feeds 0 to the
    # model for the length/line-count features.
    feature_vector = []
    for col in all_feature_columns:
        if col in features:
            feature_vector.append(features[col])
        elif col.startswith(code_prefix):
            code_key = col[len(code_prefix):]
            feature_vector.append(code_features.get(code_key, 0))
        else:
            feature_vector.append(0)

    # Probability of class index 1 from the feature-based model
    probability = rf_model.predict_proba([feature_vector])[0][1]

    # Text-based prediction
    text_input = query + " " + code_snippet
    text_vector = vectorizer.transform([text_input])
    text_probability = rf_text_model.predict_proba(text_vector)[0][1]

    # Combine predictions
    combined_probability = (probability * 0.6) + (text_probability * 0.4)
    final_prediction = 1 if combined_probability > 0.5 else 0

    # Confidence level
    if combined_probability > 0.8:
        confidence = "High"
    elif combined_probability > 0.6:
        confidence = "Medium"
    else:
        confidence = "Low"

    # Extract learning indicators
    indicator_labels = (
        ('contains_learn', "Contains learning keywords"),
        ('contains_python', "Mentions Python"),
        ('contains_basic', "Beginners/Basic level"),
        ('contains_example', "Seeks examples"),
    )
    learning_indicators = [
        label for key, label in indicator_labels if features.get(key, 0)
    ]

    return {
        'prediction': final_prediction,
        'prediction_label': 'Learning Python' if final_prediction == 1 else 'Not Learning Python',
        'confidence': confidence,
        'probability': float(combined_probability),
        'feature_based_probability': float(probability),
        'text_based_probability': float(text_probability),
        'learning_indicators': learning_indicators,
        'key_features': {k: v for k, v in features.items() if v == 1}
    }


In [None]:
# ## 13. Create API-like Interface


In [61]:
class PythonLearningDetector:
    """
    API interface for Python learning preference detection

    Every prediction is computed from the artifacts loaded in ``__init__``
    (both models, the vectorizer and the saved feature-name order), so an
    instance does not depend on model variables left over in the notebook
    namespace. Only the two feature-extraction functions
    (``extract_features`` and ``analyze_code_complexity``) must be in scope.
    """

    # Blend weights for the feature-based and text-based probabilities
    FEATURE_MODEL_WEIGHT = 0.6
    TEXT_MODEL_WEIGHT = 0.4
    # Prefix the training step puts in front of every code-derived column
    CODE_PREFIX = 'code_'

    def __init__(self):
        """Initialize the detector with saved models"""
        # NOTE: joblib.load unpickles its input and can execute arbitrary code;
        # only load artifact files that this project produced itself.
        self.rf_model = joblib.load('python_learning_detector_rf.pkl')
        self.rf_text_model = joblib.load('python_learning_detector_text_rf.pkl')
        self.vectorizer = joblib.load('tfidf_vectorizer.pkl')

        # Load feature names (one per line); blank lines are not feature names
        with open('feature_names.txt', 'r') as f:
            self.feature_names = [line.strip() for line in f if line.strip()]

    def _feature_vector(self, query_features, code_features):
        """Order the extracted values like the saved training columns."""
        vector = []
        for name in self.feature_names:
            if name in query_features:
                vector.append(query_features[name])
            elif name.startswith(self.CODE_PREFIX):
                # Strip exactly one prefix: 'code_code_length' -> 'code_length'
                vector.append(code_features.get(name[len(self.CODE_PREFIX):], 0))
            else:
                vector.append(0)
        return vector

    @staticmethod
    def _confidence_label(probability):
        """Map the blended probability to 'High' / 'Medium' / 'Low'."""
        if probability > 0.8:
            return "High"
        if probability > 0.6:
            return "Medium"
        return "Low"

    @staticmethod
    def _query_type(query):
        """Categorize the query by the first matching phrase group."""
        lowered = query.lower()
        if 'how to' in lowered:
            return 'how-to'
        if 'what is' in lowered or 'what are' in lowered:
            return 'definition'
        if 'example' in lowered or 'sample' in lowered:
            return 'example_request'
        if 'difference' in lowered or 'compare' in lowered:
            return 'comparison'
        return 'general'

    def predict(self, query, code=None):
        """
        Predict if the input indicates Python learning preference

        Args:
            query (str): User query/question (``None`` is treated as "")
            code (str, optional): Associated code snippet

        Returns:
            dict: Prediction results
        """
        query = "" if query is None else str(query)
        code = "" if code is None else str(code)

        features = extract_features(query)
        code_features = analyze_code_complexity(code)

        # Probability of class index 1 from each of the loaded models
        feature_vector = self._feature_vector(features, code_features)
        probability = self.rf_model.predict_proba([feature_vector])[0][1]

        text_vector = self.vectorizer.transform([query + " " + code])
        text_probability = self.rf_text_model.predict_proba(text_vector)[0][1]

        combined_probability = (
            probability * self.FEATURE_MODEL_WEIGHT
            + text_probability * self.TEXT_MODEL_WEIGHT
        )
        final_prediction = 1 if combined_probability > 0.5 else 0

        indicator_labels = (
            ('contains_learn', "Contains learning keywords"),
            ('contains_python', "Mentions Python"),
            ('contains_basic', "Beginners/Basic level"),
            ('contains_example', "Seeks examples"),
        )

        return {
            'prediction': final_prediction,
            'prediction_label': 'Learning Python' if final_prediction == 1 else 'Not Learning Python',
            'confidence': self._confidence_label(combined_probability),
            'probability': float(combined_probability),
            'feature_based_probability': float(probability),
            'text_based_probability': float(text_probability),
            'learning_indicators': [
                label for key, label in indicator_labels if features.get(key, 0)
            ],
            'key_features': {k: v for k, v in features.items() if v == 1},
            # Additional analysis
            'query_length': len(query),
            'code_length': len(code),
            'has_code': bool(code.strip()),
            'query_type': self._query_type(query),
        }

    def batch_predict(self, queries):
        """
        Predict learning preferences for multiple queries

        Args:
            queries (list): List of dictionaries with 'query' and optional 'code'

        Returns:
            list: List of prediction results
        """
        return [
            self.predict(item.get('query', ''), item.get('code', None))
            for item in queries
        ]

# Example usage: load the saved artifacts and classify one query/code pair
print("\nAPI Interface Example:")
print("=" * 60)

# Single prediction
test_query = "Can you teach me how to write a Python function?"
test_code = "def add_numbers(a, b):\n    return a + b"

detector = PythonLearningDetector()
result = detector.predict(test_query, test_code)

summary_lines = [
    f"Query: {test_query}",
    f"Prediction: {result['prediction_label']}",
    f"Query Type: {result['query_type']}",
    f"Confidence: {result['confidence']}",
]
for summary_line in summary_lines:
    print(summary_line)



API Interface Example:
Query: Can you teach me how to write a Python function?
Prediction: Learning Python
Query Type: how-to
Confidence: Medium


In [None]:
# ## 14. Model Deployment Recommendations


In [None]:
# Static deployment notes, kept in a named constant and printed verbatim
deployment_recommendations = """
Model Deployment Recommendations:
=================================

1. **Integration Options:**
   - REST API using Flask/FastAPI
   - Streamlit dashboard for visualization
   - Browser extension for real-time detection
   - IDE plugin for code learning assistance

2. **Monitoring:**
   - Track prediction accuracy over time
   - Monitor feature drift
   - Collect user feedback for model improvement

3. **Improvement Strategies:**
   - Regular retraining with new data
   - Active learning from user corrections
   - Ensemble with other ML algorithms
   - Incorporate user interaction data

4. **Use Cases:**
   - Personalized learning path recommendations
   - Adaptive difficulty adjustment
   - Content filtering for learning platforms
   - Automated tutoring system triggers
"""
print(deployment_recommendations)


In [None]:
# ## 15. Export Final Analysis Report


In [60]:
# Create comprehensive analysis report
# Hyperparameters are read back from the fitted estimator rather than typed in
# again, so the report cannot drift from the model that was actually trained.
model_params = rf_model.get_params()

analysis_report = {
    'model_performance': {
        'accuracy': float(accuracy),
        'text_model_accuracy': float(text_accuracy),
        'training_samples': len(X_train),
        'testing_samples': len(X_test)
    },
    'feature_analysis': {
        'total_features': len(all_feature_columns),
        'top_features': feature_importance.head(10)['feature'].tolist(),
        # feature_importance is sorted by descending importance, so row 0 is the top feature
        'most_important_feature': feature_importance.iloc[0]['feature']
    },
    'dataset_analysis': {
        'total_samples': len(df_processed),
        # Cast to built-in int/float so the values are JSON-serializable
        'learning_samples': int(learning_count),
        'non_learning_samples': int(total_count - learning_count),
        'learning_percentage': float(learning_percentage)
    },
    'model_configuration': {
        'algorithm': 'Random Forest',
        'n_estimators': model_params['n_estimators'],
        'max_depth': model_params['max_depth'],
        'class_weight': model_params['class_weight']
    }
}

# Save analysis report (json is imported in the notebook's import cell)
with open('model_analysis_report.json', 'w') as f:
    json.dump(analysis_report, f, indent=2)

print("Analysis report saved as 'model_analysis_report.json'")

# Display summary
print("\n" + "="*60)
print("MODEL TRAINING COMPLETE")
print("="*60)
print(f"Final Model Accuracy: {accuracy:.2%}")
print(f"Learning Samples Detected: {learning_count}/{total_count}")
print(f"Top Feature: {analysis_report['feature_analysis']['most_important_feature']}")
print("Models saved: python_learning_detector_rf.pkl")
print("Ready for deployment!")

Analysis report saved as 'model_analysis_report.json'

MODEL TRAINING COMPLETE
Final Model Accuracy: 86.17%
Learning Samples Detected: 7736/13815
Top Feature: code_has_function
Models saved: python_learning_detector_rf.pkl
Ready for deployment!
