In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Filter, retype and enrich the company-metrics frame with analyst features.

    Processing steps:

    1. Keep only the rows in which *every* analyst-rating column holds a
       value other than the ``"-"`` placeholder, then cast those columns
       to ``int32``.
    2. Insert, directly after ``AnalystTargetPrice`` and in this order:
       ``TotalAnalysts`` (sum of the five rating counts),
       ``AnalystConsensus`` (rating counts weighted 1 = strong sell ...
       5 = strong buy, divided by ``TotalAnalysts``) and
       ``AnalystConsensusNorm`` (min-max scaled consensus multiplied by 5).
    3. Drop descriptive columns that are not used afterwards.
    4. Append ``AnalystConsensusCategory`` (fixed bins with named labels)
       and ``AnalystConsensusEqCat`` (five quantile groups labelled with
       their value range).

    Args:
        df: Raw metrics frame. It must contain the five ``AnalystRating*``
            columns, ``AnalystTargetPrice`` and every column dropped below.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a remaining rating value cannot be converted to an
            integer, or if pandas/scikit-learn reject the data (for
            example non-unique quantile edges or no rows left).
    """
    # Weight of each rating bucket in the consensus score.
    rating_weights = {
        'AnalystRatingStrongSell': 1,
        'AnalystRatingSell': 2,
        'AnalystRatingHold': 3,
        'AnalystRatingBuy': 4,
        'AnalystRatingStrongBuy': 5,
    }
    rating_cols = list(rating_weights)

    # A row is usable only when ALL rating columns are filled in. Combining
    # the per-column tests with OR would let partially filled rows through
    # and make the integer cast below fail on the "-" placeholder.
    has_all_ratings = (df[rating_cols] != "-").all(axis=1)
    df = df.loc[has_all_ratings].astype({col: 'int32' for col in rating_cols})

    # Built-in sum keeps the int32 dtype of the rating columns.
    total_analysts = sum(df[col] for col in rating_cols)
    weighted_votes = sum(df[col] * weight for col, weight in rating_weights.items())

    # Insert the derived columns straight at their final positions instead
    # of appending and re-ordering, which also avoids assigning into a
    # column-sliced frame (SettingWithCopyWarning).
    insert_at = df.columns.get_loc('AnalystTargetPrice') + 1
    df.insert(insert_at, 'TotalAnalysts', total_analysts)
    # NOTE: rows whose ratings are all zero produce NaN here (0 / 0).
    df.insert(insert_at + 1, 'AnalystConsensus', weighted_votes / df['TotalAnalysts'])

    # Min-max scale the consensus to [0, 1], then stretch to [0, 5].
    scaler = MinMaxScaler()
    consensus_norm = scaler.fit_transform(df[['AnalystConsensus']]).ravel() * 5
    df.insert(insert_at + 2, 'AnalystConsensusNorm', consensus_norm)

    # Drop descriptive / unused columns.
    df = df.drop(columns=[
        'Description', 'AssetType', 'Currency', 'Country', 'Address',
        'OfficialSite', 'TrailingPE', 'ExDividendDate', 'DividendDate',
    ])

    # Named categories from fixed consensus thresholds.
    bins = [0, 2.5, 3.25, 3.75, 4, 5]
    labels = ['Very Poor', 'Poor', 'Average', 'Good', 'Excellent']
    df['AnalystConsensusCategory'] = pd.cut(
        df['AnalystConsensus'], bins=bins, labels=labels, include_lowest=True
    )

    # Five equal-frequency groups, labelled with the exact quantile edges.
    # One qcut call yields both the assignment and the edges.
    quintiles, edges = pd.qcut(df['AnalystConsensus'], q=5, retbins=True)
    range_labels = pd.IntervalIndex.from_breaks(edges, closed='right').astype(str)
    df['AnalystConsensusEqCat'] = quintiles.cat.rename_categories(range_labels.tolist())
    return df

# Read the raw company metrics from the local CSV export.
df = pd.read_csv(
    r'c:\Users\SergioBeamonteGonzal\Documentos Locales\MASTER\Machine Learning\company_metrics.csv'
)

# Clean a copy so the raw frame stays available, then preview the first rows.
df_clean = clean_data(df.copy())
df_clean.head()