In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from sklearn.impute import SimpleImputer
from textstat import flesch_reading_ease, flesch_kincaid_grade
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data (best-effort: the analysis can still run if the
# resources are already installed locally, so a failure is reported, not fatal).
try:
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
except Exception as exc:
    # Narrower than a bare ``except`` so Ctrl-C / SystemExit still propagate.
    print(f"Warning: NLTK data download failed ({exc}); "
          "sentiment features need the 'vader_lexicon' resource.")

class DatingProfileAnalyzer:
    def __init__(self):
        self.attractiveness_model = None
        self.compatibility_model = None
        self.feature_importance = {}
        
    def load_data(self, filepath='profiles.csv'):
        """Load and perform initial data exploration"""
        self.df = pd.read_csv(filepath)
        print(f"Dataset shape: {self.df.shape}")
        print(f"Missing values per column:\n{self.df.isnull().sum().head(10)}")
        return self.df
    
    def create_advanced_features(self):
        """Add engineered text, sentiment and profile columns to ``self.df``.

        Columns written (in place): ``completeness``, ``total_essays``,
        ``all_essays``, ``essay_length``, ``word_count``, ``sentence_count``,
        ``avg_word_length``, ``reading_ease``, ``sentiment_compound``,
        ``sentiment_positive``, ``sentiment_negative``, ``exclamation_count``,
        ``question_count``, ``emoji_count``, plus ``<col>_filled`` for each of
        drinks/smokes/drugs that exists, ``age_group`` when ``age`` exists and
        ``has_photos`` when ``photos`` exists.

        Requires ``self.df`` to be loaded. Prints the number of columns that
        are new relative to ``self.original_columns`` (or, when that attribute
        was never set, relative to the columns present on entry).
        """
        print("Creating advanced features...")

        # Baseline for the summary at the end. Falling back to the current
        # columns avoids an AttributeError when the caller never assigned
        # ``original_columns`` on a fresh analyzer.
        baseline_columns = getattr(self, 'original_columns', None)
        if baseline_columns is None:
            baseline_columns = list(self.df.columns)

        # Basic profile completeness
        self.df['completeness'] = self.df.notna().mean(axis=1)

        # Essay features. Columns this method itself creates also contain the
        # substring 'essay'; exclude them so a second run does not treat them
        # as essay text.
        generated_columns = {'total_essays', 'all_essays', 'essay_length'}
        essay_cols = [
            col for col in self.df.columns
            if 'essay' in col and col not in generated_columns
        ]
        self.df['total_essays'] = self.df[essay_cols].notna().sum(axis=1)

        # Combined essay text: join each row's essays with single spaces.
        # Iterating the underlying array avoids building one Series per row,
        # which is what a row-wise apply does.
        essay_text = self.df[essay_cols].fillna('').astype(str)
        self.df['all_essays'] = [' '.join(row) for row in essay_text.to_numpy()]

        # Text length and complexity features
        self.df['essay_length'] = self.df['all_essays'].apply(len)
        self.df['word_count'] = self.df['all_essays'].apply(lambda x: len(x.split()))

        # Count only non-blank segments: splitting "Hi." on punctuation yields
        # a trailing empty piece that would otherwise count as a sentence.
        sentence_boundary = re.compile(r'[.!?]+')
        self.df['sentence_count'] = self.df['all_essays'].apply(
            lambda x: sum(1 for part in sentence_boundary.split(x) if part.strip())
        )

        def mean_word_length(text):
            words = text.split()
            return np.mean([len(word) for word in words]) if words else 0

        self.df['avg_word_length'] = self.df['all_essays'].apply(mean_word_length)

        # Reading complexity (with error handling)
        def safe_flesch_reading_ease(text):
            try:
                if text and len(text.strip()) > 10:
                    return flesch_reading_ease(text)
                else:
                    return 50  # neutral score
            except Exception:
                # Fall back to the neutral score on scoring failures only;
                # KeyboardInterrupt/SystemExit are not swallowed.
                return 50

        self.df['reading_ease'] = self.df['all_essays'].apply(safe_flesch_reading_ease)

        # Sentiment analysis
        sid = SentimentIntensityAnalyzer()
        sentiment_scores = self.df['all_essays'].apply(
            lambda x: sid.polarity_scores(x) if x else {'compound': 0, 'pos': 0, 'neg': 0, 'neu': 0}
        )
        self.df['sentiment_compound'] = [score['compound'] for score in sentiment_scores]
        self.df['sentiment_positive'] = [score['pos'] for score in sentiment_scores]
        self.df['sentiment_negative'] = [score['neg'] for score in sentiment_scores]

        # Personality indicators from text
        self.df['exclamation_count'] = self.df['all_essays'].apply(lambda x: x.count('!'))
        self.df['question_count'] = self.df['all_essays'].apply(lambda x: x.count('?'))
        self.df['emoji_count'] = self.df['all_essays'].apply(
            lambda x: len(re.findall(r'[😀-🙏]', x)) if x else 0
        )

        # Lifestyle consistency features
        lifestyle_cols = ['drinks', 'smokes', 'drugs']
        for col in lifestyle_cols:
            if col in self.df.columns:
                self.df[f'{col}_filled'] = self.df[col].notna().astype(int)

        # Age-related features. The last bin is open-ended so that ages above
        # 100 land in '40+' instead of becoming NaN.
        if 'age' in self.df.columns:
            self.df['age_group'] = pd.cut(self.df['age'],
                                        bins=[0, 25, 30, 35, 40, np.inf],
                                        labels=['18-25', '26-30', '31-35', '36-40', '40+'])

        # Photo-related features (if available)
        if 'photos' in self.df.columns:
            self.df['has_photos'] = self.df['photos'].notna().astype(int)

        new_feature_count = len(
            [col for col in self.df.columns if col not in baseline_columns]
        )
        print(f"Created {new_feature_count} new features")
    
    def create_attractiveness_target(self, method='composite'):
        """Create sophisticated attractiveness target variable"""
        if method == 'composite':
            # Weighted composite score
            weights = {
                'completeness': 0.25,
                'essay_quality': 0.35,
                'engagement': 0.20,
                'lifestyle': 0.20
            }
            
            # Normalize essay length (longer essays show effort)
            essay_score = np.clip(self.df['essay_length'] / self.df['essay_length'].quantile(0.9), 0, 1)
            
            # Engagement score (sentiment + interactivity)
            engagement_score = (
                (self.df['sentiment_compound'] + 1) / 2 * 0.6 +  # Normalize sentiment to 0-1
                np.clip((self.df['exclamation_count'] + self.df['question_count']) / 10, 0, 1) * 0.4
            )
            
            # Lifestyle score (completeness in lifestyle fields)
            lifestyle_cols = ['drinks', 'smokes', 'drugs', 'diet']
            lifestyle_score = self.df[[col for col in lifestyle_cols if col in self.df.columns]].notna().mean(axis=1)
            
            # Composite attractiveness score
            self.df['attractiveness_score'] = (
                weights['completeness'] * self.df['completeness'] +
                weights['essay_quality'] * essay_score +
                weights['engagement'] * engagement_score +
                weights['lifestyle'] * lifestyle_score
            )
            
        # Binary target for classification
        self.df['high_attractiveness'] = (
            self.df['attractiveness_score'] > self.df['attractiveness_score'].quantile(0.7)
        ).astype(int)
        
        print(f"Attractiveness distribution: {self.df['high_attractiveness'].value_counts().to_dict()}")
    
    def create_compatibility_pairs(self, n_pairs=2000):
        """Create more sophisticated compatibility pairs with reduced memory usage"""
        np.random.seed(42)
        
        print(f"Creating {n_pairs} user pairs...")
        
        # Sample a subset of users for pair creation to manage memory
        n_users = min(10000, len(self.df))  # Limit to 10k users max
        sampled_users = self.df.sample(n=n_users, random_state=42).index
        
        # Create stratified pairs (some positive, some negative examples)
        positive_pairs = []
        negative_pairs = []
        
        # Group users by key characteristics for positive pairs
        for orientation in self.df.loc[sampled_users, 'orientation'].dropna().unique()[:5]:  # Limit orientations
            subset = self.df[(self.df.index.isin(sampled_users)) & (self.df['orientation'] == orientation)]
            if len(subset) > 1:
                # Create positive pairs within same orientation
                n_pos_pairs = min(50, len(subset)//2)  # Reduced number
                for _ in range(n_pos_pairs):
                    if len(positive_pairs) >= n_pairs // 3:  # Limit positive pairs
                        break
                    pair = np.random.choice(subset.index, 2, replace=False)
                    positive_pairs.append({'user1': pair[0], 'user2': pair[1], 'label': 1})
        
        # Create negative pairs (random pairs)
        remaining_pairs = n_pairs - len(positive_pairs)
        for _ in range(remaining_pairs):
            pair = np.random.choice(sampled_users, 2, replace=False)
            negative_pairs.append({'user1': pair[0], 'user2': pair[1], 'label': 0})
        
        # Combine pairs
        all_pairs = positive_pairs + negative_pairs
        self.user_pairs = pd.DataFrame(all_pairs)
        
        # Add pair features
        self._add_pair_features()
        
        print(f"Created {len(self.user_pairs)} user pairs")
        print(f"Compatibility distribution: {self.user_pairs['compatible'].value_counts().to_dict()}")
    
    def _add_pair_features(self):
        """Add features for user pairs"""
        # Merge user data
        user1_data = self.df.add_suffix('_1')
        user2_data = self.df.add_suffix('_2')
        
        self.user_pairs = self.user_pairs.merge(
            user1_data[['age_1', 'orientation_1', 'diet_1', 'religion_1', 'attractiveness_score_1']], 
            left_on='user1', right_index=True, how='left'
        )
        self.user_pairs = self.user_pairs.merge(
            user2_data[['age_2', 'orientation_2', 'diet_2', 'religion_2', 'attractiveness_score_2']], 
            left_on='user2', right_index=True, how='left'
        )
        
        # Compatibility features
        self.user_pairs['age_diff'] = abs(self.user_pairs['age_1'] - self.user_pairs['age_2'])
        self.user_pairs['age_compatible'] = (self.user_pairs['age_diff'] <= 10).astype(int)
        
        # Shared characteristics
        for col in ['orientation', 'diet', 'religion']:
            if f'{col}_1' in self.user_pairs.columns:
                self.user_pairs[f'same_{col}'] = (
                    self.user_pairs[f'{col}_1'] == self.user_pairs[f'{col}_2']
                ).astype(int)
        
        # Attractiveness compatibility
        self.user_pairs['attractiveness_diff'] = abs(
            self.user_pairs['attractiveness_score_1'] - self.user_pairs['attractiveness_score_2']
        )
        self.user_pairs['attractiveness_match'] = (self.user_pairs['attractiveness_diff'] <= 0.2).astype(int)
        
        # Create target based on multiple factors
        self.user_pairs['compatible'] = (
            (self.user_pairs.get('same_orientation', 0) == 1) & 
            (self.user_pairs['age_compatible'] == 1) &
            (self.user_pairs['attractiveness_match'] == 1)
        ).astype(int)
    
    def build_attractiveness_model(self):
        """Train, tune and evaluate the attractiveness classifier.

        Features are the combined essay text (TF-IDF), whichever of the listed
        categorical and numerical columns exist in ``self.df``, and the target
        is ``high_attractiveness``. A random forest is tuned with a 3-fold
        grid search scored by ROC-AUC on an 80/20 stratified split; the best
        pipeline is stored in ``self.attractiveness_model`` and its metrics
        are printed.

        Returns:
            tuple: ``(X_test, y_test, test_preds, test_probs)`` for the
            held-out split.
        """
        print("Building attractiveness prediction model...")

        # Candidate features; only columns actually present are kept.
        text_column = 'all_essays'
        categorical_features = [
            name
            for name in ('age_group', 'body_type', 'diet', 'drinks', 'orientation')
            if name in self.df.columns
        ]
        numerical_features = [
            name
            for name in (
                'completeness', 'essay_length', 'word_count', 'reading_ease',
                'sentiment_compound', 'sentiment_positive', 'exclamation_count',
            )
            if name in self.df.columns
        ]

        # Design matrix and target; missing essay text becomes the empty string.
        X = self.df[[text_column] + categorical_features + numerical_features].copy()
        y = self.df['high_attractiveness'].copy()
        X[text_column] = X[text_column].fillna('')

        # One transformer per feature family.
        essay_vectorizer = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1,2))
        categorical_steps = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ])
        numerical_steps = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        preprocessor = ColumnTransformer([
            ('text', essay_vectorizer, text_column),
            ('cat', categorical_steps, categorical_features),
            ('num', numerical_steps, numerical_features),
        ])

        # Untuned pipeline; replaced by the tuned one after the search.
        self.attractiveness_model = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42)),
        ])

        # Small grid to keep the search affordable on large datasets.
        param_grid = {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [15, None],
            'classifier__min_samples_split': [5, 10],
        }

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Single-process search to avoid memory issues with parallel workers.
        grid_search = GridSearchCV(
            self.attractiveness_model,
            param_grid,
            cv=3,
            scoring='roc_auc',
            n_jobs=1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)
        self.attractiveness_model = grid_search.best_estimator_

        # Evaluate the tuned pipeline on both splits.
        tuned = self.attractiveness_model
        train_preds = tuned.predict(X_train)
        test_preds = tuned.predict(X_test)
        test_probs = tuned.predict_proba(X_test)[:, 1]

        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Train Accuracy: {accuracy_score(y_train, train_preds):.3f}")
        print(f"Test Accuracy: {accuracy_score(y_test, test_preds):.3f}")
        print(f"Test ROC-AUC: {roc_auc_score(y_test, test_probs):.3f}")
        print(f"Test F1: {f1_score(y_test, test_preds):.3f}")

        return X_test, y_test, test_preds, test_probs
    
    def build_compatibility_model(self):
        """Select, train and evaluate the pair-compatibility classifier.

        Uses the numeric pair features from ``self.user_pairs`` (plus any
        ``same_<col>`` flags present), fills missing values with 0, compares a
        random forest, gradient boosting and logistic regression by 5-fold
        cross-validated F1 on an 80/20 stratified split, then fits the best
        one and stores it in ``self.compatibility_model``.

        Returns:
            tuple: ``(X_test, y_test, test_preds)`` for the held-out split.

        Raises:
            ValueError: if no candidate produced a usable (non-NaN)
                cross-validation score.
        """
        print("\nBuilding compatibility prediction model...")

        # Feature selection
        compatibility_features = [
            'age_diff', 'age_compatible', 'attractiveness_diff', 'attractiveness_match'
        ]

        # Add categorical matching features if available
        for col in ['orientation', 'diet', 'religion']:
            if f'same_{col}' in self.user_pairs.columns:
                compatibility_features.append(f'same_{col}')

        # Prepare data
        X = self.user_pairs[compatibility_features].copy()
        y = self.user_pairs['compatible'].copy()

        # Handle missing values
        X = X.fillna(0)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Try multiple models
        models = {
            'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
            'GradientBoosting': GradientBoostingClassifier(random_state=42),
            'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000)
        }

        # Start below any attainable F1 so a model is still selected when every
        # candidate scores exactly 0 (a 0 baseline with a strict '>' would
        # leave best_model as None and crash on .fit()).
        best_score = float('-inf')
        best_model = None

        for name, model in models.items():
            # Cross-validation
            cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
            avg_score = cv_scores.mean()

            print(f"{name} CV F1: {avg_score:.3f} (+/- {cv_scores.std() * 2:.3f})")

            # NaN means at least one fold failed to fit/score; never prefer it.
            if not np.isnan(avg_score) and avg_score > best_score:
                best_score = avg_score
                best_model = model

        if best_model is None:
            raise ValueError(
                "No candidate model produced a valid cross-validation score; "
                "check the pair features and the class balance of 'compatible'"
            )

        # Train best model
        self.compatibility_model = best_model
        self.compatibility_model.fit(X_train, y_train)

        # Evaluate
        train_preds = self.compatibility_model.predict(X_train)
        test_preds = self.compatibility_model.predict(X_test)

        print(f"\nBest model: {type(best_model).__name__}")
        print(f"Train F1: {f1_score(y_train, train_preds):.3f}")
        print(f"Test F1: {f1_score(y_test, test_preds):.3f}")
        print(f"Test Accuracy: {accuracy_score(y_test, test_preds):.3f}")

        # Feature importance (tree-based models only)
        if hasattr(self.compatibility_model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': compatibility_features,
                'importance': self.compatibility_model.feature_importances_
            }).sort_values('importance', ascending=False)
            print(f"\nTop feature importances:")
            print(importance_df.head())

        return X_test, y_test, test_preds
    
    def predict_attractiveness(self, profile_data):
        """Predict attractiveness for new profile"""
        if self.attractiveness_model is None:
            raise ValueError("Attractiveness model not trained yet!")
        
        # Convert to DataFrame if needed
        if isinstance(profile_data, dict):
            profile_data = pd.DataFrame([profile_data])
        
        prediction = self.attractiveness_model.predict(profile_data)[0]
        probability = self.attractiveness_model.predict_proba(profile_data)[0]
        
        return {
            'prediction': bool(prediction),
            'probability_high': probability[1],
            'confidence': max(probability)
        }
    
    def predict_compatibility(self, user1_id, user2_id):
        """Predict compatibility between two users"""
        if self.compatibility_model is None:
            raise ValueError("Compatibility model not trained yet!")
        
        # Create pair features
        pair_data = self._create_pair_features(user1_id, user2_id)
        
        prediction = self.compatibility_model.predict([pair_data])[0]
        if hasattr(self.compatibility_model, 'predict_proba'):
            probability = self.compatibility_model.predict_proba([pair_data])[0]
        else:
            probability = [1-prediction, prediction]
        
        return {
            'prediction': bool(prediction),
            'probability_compatible': probability[1] if len(probability) > 1 else prediction,
            'confidence': max(probability) if len(probability) > 1 else abs(prediction - 0.5) + 0.5
        }
    
    def _create_pair_features(self, user1_id, user2_id):
        """Create features for a specific user pair"""
        user1 = self.df.loc[user1_id]
        user2 = self.df.loc[user2_id]
        
        features = []
        
        # Age difference
        features.append(abs(user1.get('age', 30) - user2.get('age', 30)))
        features.append(int(features[0] <= 10))  # age_compatible
        
        # Attractiveness difference
        attr_diff = abs(user1.get('attractiveness_score', 0.5) - user2.get('attractiveness_score', 0.5))
        features.append(attr_diff)
        features.append(int(attr_diff <= 0.2))  # attractiveness_match
        
        # Categorical matches
        for col in ['orientation', 'diet', 'religion']:
            if col in user1 and col in user2:
                features.append(int(user1[col] == user2[col]))
            else:
                features.append(0)
        
        return features
    
    def generate_insights(self):
        """Generate insights from the trained models"""
        insights = {
            'dataset_summary': {
                'total_profiles': len(self.df),
                'high_attractiveness_rate': self.df['high_attractiveness'].mean(),
                'avg_essay_length': self.df['essay_length'].mean(),
                'avg_completeness': self.df['completeness'].mean()
            }
        }
        
        if hasattr(self, 'user_pairs'):
            insights['compatibility_summary'] = {
                'total_pairs': len(self.user_pairs),
                'compatibility_rate': self.user_pairs['compatible'].mean(),
                'avg_age_difference': self.user_pairs['age_diff'].mean()
            }
        
        return insights
    
    def visualize_results(self, X_test_attr=None, y_test_attr=None, test_probs_attr=None,
                         X_test_comp=None, y_test_comp=None, test_preds_comp=None):
        """Draw a 2x3 grid of model-performance and data-distribution plots.

        Panels: ROC curve and largest feature importances for the
        attractiveness model (only when ``test_probs_attr`` is given), the
        attractiveness score histogram, the compatibility confusion matrix
        (only when ``test_preds_comp`` is given), and two age-difference
        plots (only when the instance has ``user_pairs``). Panels whose
        inputs are absent are left empty.

        Args:
            X_test_attr: Unused; accepted for call-site symmetry.
            y_test_attr: True labels for the attractiveness test split.
            test_probs_attr: Predicted positive-class probabilities.
            X_test_comp: Unused; accepted for call-site symmetry.
            y_test_comp: True labels for the compatibility test split.
            test_preds_comp: Predicted labels for the compatibility split.
        """
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # Attractiveness model visualizations
        if test_probs_attr is not None:
            # ROC Curve
            from sklearn.metrics import roc_curve
            fpr, tpr, _ = roc_curve(y_test_attr, test_probs_attr)
            axes[0, 0].plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc_score(y_test_attr, test_probs_attr):.3f})')
            axes[0, 0].plot([0, 1], [0, 1], 'k--')
            axes[0, 0].set_xlabel('False Positive Rate')
            axes[0, 0].set_ylabel('True Positive Rate')
            axes[0, 0].set_title('Attractiveness Model - ROC Curve')
            axes[0, 0].legend()

            # Feature importance (if available)
            classifier = self.attractiveness_model.named_steps['classifier']
            if hasattr(classifier, 'feature_importances_'):
                # This is simplified - in practice you'd need to map back to original features.
                # Size the x positions from the data so a model with fewer
                # than 10 features does not cause a shape mismatch in bar().
                top_importances = sorted(classifier.feature_importances_)[-10:]
                axes[0, 1].bar(range(len(top_importances)), top_importances)
                axes[0, 1].set_title('Top 10 Feature Importances - Attractiveness')
                axes[0, 1].set_xlabel('Features')
                axes[0, 1].set_ylabel('Importance')

        # Data distribution visualizations
        axes[0, 2].hist(self.df['attractiveness_score'], bins=30, alpha=0.7)
        axes[0, 2].set_title('Attractiveness Score Distribution')
        axes[0, 2].set_xlabel('Attractiveness Score')
        axes[0, 2].set_ylabel('Frequency')

        # Compatibility model visualizations
        if test_preds_comp is not None:
            # Confusion matrix
            from sklearn.metrics import confusion_matrix
            cm = confusion_matrix(y_test_comp, test_preds_comp)
            sns.heatmap(cm, annot=True, fmt='d', ax=axes[1, 0])
            axes[1, 0].set_title('Compatibility Model - Confusion Matrix')
            axes[1, 0].set_xlabel('Predicted')
            axes[1, 0].set_ylabel('Actual')

        # Age difference vs compatibility
        if hasattr(self, 'user_pairs'):
            compatible_pairs = self.user_pairs[self.user_pairs['compatible'] == 1]
            incompatible_pairs = self.user_pairs[self.user_pairs['compatible'] == 0]

            axes[1, 1].hist(compatible_pairs['age_diff'], alpha=0.7, label='Compatible', bins=20)
            axes[1, 1].hist(incompatible_pairs['age_diff'], alpha=0.7, label='Incompatible', bins=20)
            axes[1, 1].set_title('Age Difference Distribution by Compatibility')
            axes[1, 1].set_xlabel('Age Difference')
            axes[1, 1].set_ylabel('Frequency')
            axes[1, 1].legend()

            # Compatibility rate by age difference. observed=False keeps empty
            # bins on the axis and states the categorical-groupby behavior
            # explicitly rather than relying on the library default.
            age_bins = pd.cut(self.user_pairs['age_diff'], bins=10)
            age_comp = self.user_pairs.groupby(age_bins, observed=False)['compatible'].mean()
            age_comp.plot(kind='bar', ax=axes[1, 2], rot=45)
            axes[1, 2].set_title('Compatibility Rate by Age Difference')
            axes[1, 2].set_xlabel('Age Difference Range')
            axes[1, 2].set_ylabel('Compatibility Rate')

        plt.tight_layout()
        plt.show()

# Usage example
def main():
    """Run the full pipeline on ``profiles.csv`` and return the analyzer.

    Steps: load the data, engineer features, build the attractiveness target
    and the user pairs, train both models, print the insight summary and show
    the visualizations.
    """
    analyzer = DatingProfileAnalyzer()

    # Load the data and remember which columns came from the file.
    profiles = analyzer.load_data('profiles.csv')  # Make sure you have this file
    analyzer.original_columns = list(profiles.columns)

    # Feature engineering, in dependency order.
    analyzer.create_advanced_features()
    analyzer.create_attractiveness_target()
    analyzer.create_compatibility_pairs(n_pairs=2000)  # Reduced for memory efficiency

    # Train both models and keep the held-out results for plotting.
    X_test_attr, y_test_attr, _test_preds_attr, test_probs_attr = analyzer.build_attractiveness_model()
    X_test_comp, y_test_comp, test_preds_comp = analyzer.build_compatibility_model()

    # Print the insight summary; floats get three decimals.
    divider = "=" * 50
    print("\n" + divider)
    print("INSIGHTS SUMMARY")
    print(divider)
    for category, data in analyzer.generate_insights().items():
        print(f"\n{category.upper()}:")
        for key, value in data.items():
            if isinstance(value, float):
                print(f"  {key}: {value:.3f}")
            else:
                print(f"  {key}: {value}")

    analyzer.visualize_results(
        X_test_attr=X_test_attr,
        y_test_attr=y_test_attr,
        test_probs_attr=test_probs_attr,
        X_test_comp=X_test_comp,
        y_test_comp=y_test_comp,
        test_preds_comp=test_preds_comp,
    )

    return analyzer

# Run the enhanced analysis only when this file is executed directly, and keep
# the returned analyzer in a module-level name so it can be used afterwards.
if __name__ == "__main__":
    analyzer = main()

Dataset shape: (59946, 31)
Missing values per column:
age              0
body_type     5296
diet         24395
drinks        2985
drugs        14080
education     6628
essay0        5488
essay1        7572
essay2        9638
essay3       11476
dtype: int64
Creating advanced features...


In [35]:
# Initialize and run
analyzer = DatingProfileAnalyzer()
analyzer.load_data('profiles.csv')
# create_advanced_features() reports its new columns relative to
# analyzer.original_columns, so record the loaded schema first (as main() does);
# a freshly constructed analyzer does not have that attribute yet.
analyzer.original_columns = list(analyzer.df.columns)
analyzer.create_advanced_features()
analyzer.create_attractiveness_target()
analyzer.build_attractiveness_model()

# Make predictions
# NOTE(review): new_profile_data is not defined in this cell — presumably it
# comes from an earlier cell; confirm it carries the model's feature columns.
result = analyzer.predict_attractiveness(new_profile_data)
print(f"High attractiveness: {result['prediction']} (confidence: {result['confidence']:.2f})")

Dataset shape: (59946, 31)
Missing values per column:
age              0
body_type     5296
diet         24395
drinks        2985
drugs        14080
education     6628
essay0        5488
essay1        7572
essay2        9638
essay3       11476
dtype: int64
Creating advanced features...


MemoryError: 