In [8]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [9]:
# Load the small English spaCy pipeline once at module level; process_problem_statement
# below reads this global instead of reloading the model on every call.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Function to process the problem statement
def process_problem_statement(statement):
    """Extract lemmas, named entities and keywords from a problem statement.

    Args:
        statement: Free-text description of the machine-learning task.

    Returns:
        dict with three keys:
            'lemmatized_tokens': list of lower-cased lemmas for every token that
                is neither a stop word nor punctuation.
            'entities': list of ``(text, label)`` tuples for the named entities
                found by the spaCy pipeline.
            'keywords': array of vocabulary terms produced by ``CountVectorizer``
                with English stop words removed (empty when the statement
                contains no usable term).
    """
    import numpy as np  # local import: this cell's import list does not include numpy

    # Parse the statement with its original casing. Lower-casing before parsing
    # throws away the capitalisation cues that named-entity recognition relies
    # on, so lemmas are lower-cased afterwards instead.
    doc = nlp(statement)
    lemmatized_tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct
    ]

    # Extract Named Entities (potential target and features)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Extract important keywords using CountVectorizer (it lower-cases internally)
    vectorizer = CountVectorizer(stop_words='english')
    try:
        vectorizer.fit([statement])
        keywords = vectorizer.get_feature_names_out()
    except ValueError:
        # Raised for an empty vocabulary (blank or stop-word-only statement).
        keywords = np.array([], dtype=object)

    return {
        'lemmatized_tokens': lemmatized_tokens,
        'entities': entities,
        'keywords': keywords
    }

In [11]:
# Run the extractor on a sample statement and print each part of the result.
problem_statement = "Predict the house price based on the features such as size, location, and number of rooms"

processed_output = process_problem_statement(problem_statement)

# (printed label, key in the result dict) pairs, in display order.
for label, key in (
    ("Lemmatized Tokens:", 'lemmatized_tokens'),
    ("Named Entities:", 'entities'),
    ("Keywords:", 'keywords'),
):
    print(label, processed_output[key])


Lemmatized Tokens: ['predict', 'house', 'price', 'base', 'feature', 'size', 'location', 'number', 'room']
Named Entities: []
Keywords: ['based' 'features' 'house' 'location' 'number' 'predict' 'price' 'rooms'
 'size']


In [5]:
# pandas is imported here because, in notebook order, this is the first cell
# that uses `pd`; without it a fresh-kernel "Run All" stops with a NameError.
import pandas as pd

# Load the house-price CSV from the notebook's working directory.
df = pd.read_csv("house_prices.csv")

In [16]:
# Preview the first rows of the loaded frame to check the columns and value formats.
df.head()

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810145,42491,5,2.5,3650,9050,2.0,0,4,5,...,1921,0,122003,52.8645,-114.557,2880,5400,2,58,2380000
1,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,...,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
2,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,...,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
3,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,...,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
4,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,...,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000


In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# List the column labels exactly as they appear in the CSV header.
df.columns

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, silhouette_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pickle
import spacy
import re
import os

In [3]:
class AutoMLPipeline:
    """Keyword-driven AutoML helper.

    Given a CSV path and a free-text problem statement, the pipeline guesses the
    problem type, picks a target column, preprocesses the data, trains a small
    set of models and saves them to disk.

    Attributes:
        nlp: spaCy pipeline used to tokenise problem statements.
        models: dict mapping model name to the fitted model.
        problem_type: one of 'classification', 'regression', 'unsupervised' or
            'reinforcement' (the last is detected but cannot be trained).
        target_column: name of the column chosen as prediction target.
        feature_columns: list of all remaining column names.
        scaler: the StandardScaler fitted by ``preprocess_data`` (None before).
        nn_classes: for classification, the original label for each output unit
            of the neural network (None otherwise).
    """

    # Problem types for which train_models knows how to build models.
    TRAINABLE_PROBLEM_TYPES = ('classification', 'regression', 'unsupervised')

    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self.models = {}
        self.problem_type = None
        self.target_column = None
        self.feature_columns = None
        self.scaler = None
        self.nn_classes = None

    def _require_trainable_problem_type(self):
        """Raise ValueError when ``self.problem_type`` has no training recipe."""
        if self.problem_type not in self.TRAINABLE_PROBLEM_TYPES:
            raise ValueError(
                f"Unsupported problem type {self.problem_type!r}; "
                f"expected one of {self.TRAINABLE_PROBLEM_TYPES}"
            )

    def analyze_problem_statement(self, statement):
        """Guess the problem type from keywords in ``statement``.

        Each token is counted once per family if it contains any keyword of that
        family. The family with the most hits wins; ties between families with
        at least one hit are resolved in the order supervised, unsupervised,
        reinforcement. A supervised statement is 'classification' when the text
        contains 'classify', 'category' or 'class', otherwise 'regression'.

        Returns:
            'classification', 'regression', 'unsupervised' or 'reinforcement'.
            'reinforcement' is also the fallback when no keyword matches at all.
        """
        text = statement.lower()
        doc = self.nlp(text)

        # 'classify' is counted as supervised so that statements such as
        # "classify emails ..." are not routed to the fallback branch.
        supervised_keywords = ['predict', 'classification', 'classify', 'regression', 'forecast']
        unsupervised_keywords = ['cluster', 'group', 'segment', 'pattern']
        reinforcement_keywords = ['reward', 'action', 'agent', 'environment', 'policy']

        def keyword_hits(keywords):
            return sum(1 for word in doc if any(keyword in word.text for keyword in keywords))

        supervised_count = keyword_hits(supervised_keywords)
        unsupervised_count = keyword_hits(unsupervised_keywords)
        reinforcement_count = keyword_hits(reinforcement_keywords)

        if supervised_count > 0 and supervised_count >= max(unsupervised_count, reinforcement_count):
            if any(word in text for word in ['classify', 'category', 'class']):
                return 'classification'
            return 'regression'
        if unsupervised_count > 0 and unsupervised_count >= reinforcement_count:
            return 'unsupervised'
        return 'reinforcement'

    def identify_features(self, df, problem_statement):
        """Choose the target column and treat every other column as a feature.

        The target is looked up in this order:
          1. a statement token (dependency dobj/pobj/attr/nsubj) whose text
             equals a column name, compared case-insensitively;
          2. the first column whose name contains 'target', 'label', 'price',
             'score' or 'class';
          3. for classification, the column with the fewest unique values;
             otherwise the numeric column with the highest variance, or the
             last column when there is no numeric column.

        Returns:
            ``(target_column, feature_columns)``; both are also stored on self.
        """
        doc = self.nlp(problem_statement.lower())

        # The statement is lower-cased, so index the columns by lower-cased name;
        # otherwise a column such as "Sales" can never match the token "sales".
        columns_by_lower = {}
        for col in df.columns:
            columns_by_lower.setdefault(str(col).strip().lower(), col)

        # Step 1: try extracting the target variable from the problem statement
        potential_targets = [
            columns_by_lower[token.text]
            for token in doc
            if token.dep_ in ['dobj', 'pobj', 'attr', 'nsubj'] and token.text in columns_by_lower
        ]

        # Step 2: use the extracted target if found, otherwise apply heuristics
        if potential_targets:
            self.target_column = potential_targets[0]
        else:
            # Heuristic 1: column name contains a typical target keyword
            target_keywords = ['target', 'label', 'price', 'score', 'class']
            possible_target_columns = [
                col for col in df.columns
                if any(keyword in str(col).lower() for keyword in target_keywords)
            ]
            if possible_target_columns:
                self.target_column = possible_target_columns[0]
            elif self.problem_type == 'classification':
                # Heuristic 2: classification -> column with the fewest unique values
                self.target_column = df.nunique().idxmin()
            else:
                # Heuristic 3: otherwise -> numeric column with the highest variance
                num_columns = df.select_dtypes(include=['number']).columns
                self.target_column = df[num_columns].var().idxmax() if not num_columns.empty else df.columns[-1]

        # Step 3: every other column is a feature
        self.feature_columns = [col for col in df.columns if col != self.target_column]

        return self.target_column, self.feature_columns

    def preprocess_data(self, df):
        """Impute, label-encode and standard-scale ``df``; returns a new frame.

        Missing values are filled with the column mode for classification. For
        other problem types numeric columns use the mean and the remaining
        columns use the mode (a plain ``df.mean()`` fails on non-numeric
        columns in recent pandas). Non-numeric columns are label-encoded.

        For classification the target column is left unscaled so that it keeps
        discrete class labels; scaling it would turn the labels into continuous
        floats that classifiers reject. All other columns are standard-scaled.

        NOTE(review): the scaler is fitted on the full frame before the
        train/test split in run_pipeline, so test rows influence the scaling.
        """
        df = df.copy()

        if self.problem_type == 'classification':
            modes = df.mode()
            fill_values = modes.iloc[0].to_dict() if not modes.empty else {}
        else:
            fill_values = df.mean(numeric_only=True).to_dict()
            other_columns = [col for col in df.columns if col not in fill_values]
            if other_columns:
                modes = df[other_columns].mode()
                if not modes.empty:
                    fill_values.update(modes.iloc[0].to_dict())
        # Columns that are entirely null have no usable fill value.
        fill_values = {col: value for col, value in fill_values.items() if pd.notna(value)}
        if fill_values:
            df = df.fillna(value=fill_values)

        for column in df.columns:
            if not pd.api.types.is_numeric_dtype(df[column]):
                df[column] = LabelEncoder().fit_transform(df[column].astype(str))

        keep_unscaled = (
            self.problem_type == 'classification' and self.target_column in df.columns
        )
        scale_columns = [
            col for col in df.columns
            if not (keep_unscaled and col == self.target_column)
        ]

        scaler = StandardScaler()
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df[scale_columns]),
            columns=scale_columns,
            index=df.index,
        )
        if keep_unscaled:
            df_scaled[self.target_column] = df[self.target_column]
            df_scaled = df_scaled[list(df.columns)]  # restore original column order
        self.scaler = scaler

        return df_scaled

    def train_models(self, X_train, X_test, y_train, y_test):
        """Fit the models for ``self.problem_type`` and score them on the test split.

        Returns:
            dict mapping model name to its score: accuracy for classification,
            mean squared error for regression, silhouette score for clustering.
            The neural network (classification/regression) is scored as well
            under the key 'neural_network'.

        Raises:
            ValueError: if ``self.problem_type`` is not a trainable type.
        """
        self._require_trainable_problem_type()

        nn_model = None
        y_train_nn = None
        self.nn_classes = None

        if self.problem_type == 'classification':
            models = {
                'logistic_regression': LogisticRegression(),
                'random_forest': RandomForestClassifier(random_state=42)
            }

            # sparse_categorical_crossentropy needs labels 0..k-1, so train the
            # network on label indices and keep the original labels for decoding.
            self.nn_classes, y_train_nn = np.unique(y_train, return_inverse=True)
            nn_model = Sequential([
                Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
                Dense(32, activation='relu'),
                Dense(len(self.nn_classes), activation='softmax')
            ])
            nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        elif self.problem_type == 'regression':
            models = {
                'linear_regression': LinearRegression(),
                'random_forest': RandomForestRegressor(random_state=42)
            }

            y_train_nn = np.asarray(y_train, dtype='float32')
            nn_model = Sequential([
                Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
                Dense(32, activation='relu'),
                Dense(1)
            ])
            nn_model.compile(optimizer='adam', loss='mse')

        else:  # 'unsupervised'
            models = {
                'kmeans': KMeans(n_clusters=3, random_state=42)  # Adjust number of clusters as needed
            }

        results = {}
        for name, model in models.items():
            if self.problem_type == 'unsupervised':
                model.fit(X_train)
                results[name] = silhouette_score(X_test, model.predict(X_test))
            else:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                if self.problem_type == 'classification':
                    results[name] = accuracy_score(y_test, y_pred)
                else:
                    results[name] = mean_squared_error(y_test, y_pred)

            self.models[name] = model

        if nn_model is not None:
            X_train_nn = np.asarray(X_train, dtype='float32')
            X_test_nn = np.asarray(X_test, dtype='float32')
            nn_model.fit(X_train_nn, y_train_nn, epochs=10, batch_size=32, verbose=0)

            raw_predictions = nn_model.predict(X_test_nn, verbose=0)
            if self.problem_type == 'classification':
                nn_pred = self.nn_classes[np.argmax(raw_predictions, axis=1)]
                results['neural_network'] = accuracy_score(y_test, nn_pred)
            else:
                results['neural_network'] = mean_squared_error(y_test, raw_predictions.ravel())
            self.models['neural_network'] = nn_model

        return results

    def save_models(self, output_dir='saved_models'):
        """Write every fitted model to ``output_dir`` (created if missing).

        Models whose name contains 'neural_network' are saved with their own
        ``save`` method as ``<name>.h5``; all others are pickled to ``<name>.pkl``.
        """
        os.makedirs(output_dir, exist_ok=True)

        for name, model in self.models.items():
            if 'neural_network' in name:
                model.save(os.path.join(output_dir, f'{name}.h5'))
            else:
                with open(os.path.join(output_dir, f'{name}.pkl'), 'wb') as f:
                    pickle.dump(model, f)

    def run_pipeline(self, dataset_path, problem_statement):
        """Run detection, preprocessing, training and saving end to end.

        Args:
            dataset_path: path of the CSV file to load.
            problem_statement: free-text description of the task.

        Returns:
            The score dict produced by ``train_models``.

        Raises:
            ValueError: if the detected problem type cannot be trained.
        """
        df = pd.read_csv(dataset_path)

        self.problem_type = self.analyze_problem_statement(problem_statement)
        print(f"Detected problem type: {self.problem_type}")
        # Fail before the expensive steps rather than deep inside training.
        self._require_trainable_problem_type()

        target_col, feature_cols = self.identify_features(df, problem_statement)
        print(f"Target column: {target_col}")
        print(f"Feature columns: {feature_cols}")

        df_processed = self.preprocess_data(df)

        X = df_processed[feature_cols]
        y = df_processed[target_col]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        results = self.train_models(X_train, X_test, y_train, y_test)

        self.save_models()

        return results


In [None]:
# End-to-end demo: detect the task from the statement, train on the CSV and
# print one score line per trained model.
problem_statement = "Predict the house prices based on various features like size, location, and number of rooms"
dataset_path = "house_prices.csv"  # Replace with your dataset path

pipeline = AutoMLPipeline()
results = pipeline.run_pipeline(dataset_path, problem_statement)

print("\nModel Performance:")
for model_name in results:
    print(f"{model_name}: {results[model_name]:.5f}")

In [6]:
# Imported before first use: the loop below calls pd.api.types, so the import
# must not sit after it in the cell.
import pandas as pd

# Build one summary record per column of df, then assemble them into a frame.
summary = []

for col in df.columns:
    data = df[col]
    is_numeric = pd.api.types.is_numeric_dtype(data)  # Min/Max/Mean only for numeric columns
    modes = data.mode()  # computed once; empty when the column has no non-null value
    summary.append({
        'Feature': col,
        'Type': data.dtype,
        'Nulls': data.isnull().sum(),
        'Unique': data.nunique(),
        'Min': data.min() if is_numeric else None,
        'Max': data.max() if is_numeric else None,
        'Mean': data.mean() if is_numeric else None,
        'Mode': modes.iloc[0] if not modes.empty else None,
        'Top 5 Values': data.value_counts().head(5).to_dict()
    })

summary_df = pd.DataFrame(summary)

In [10]:
# Number of summarised features; summary_df has one row per column of df.
len(summary_df["Feature"])

23

In [11]:
# Scratch check: take the first element of a two-item tuple and print it.
a = ("abc", "egc")
b = next(iter(a))
print(b)

abc
