<a href="https://colab.research.google.com/github/PhiWhyyy/SolarData-Sept-2017-/blob/main/solar_sc24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif

# Memory-efficient data loading and preprocessing
def load_and_preprocess_data(file_path):
    # Use low_memory to reduce initial memory load
    df = pd.read_excel(file_path, engine='openpyxl', low_memory=True)

    # Print initial dataset info
    print("Initial Dataset Info:")
    print(f"Original Shape: {df.shape}")
    print("\nColumn Types:")
    print(df.dtypes)

    # Remove columns with high missing values
    threshold = 0.7  # Remove columns with more than 70% missing values
    df = df.dropna(thresh=len(df) * (1 - threshold), axis=1)

    # Convert memory-heavy columns to more efficient types
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    # Handle categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        # Convert to category type to reduce memory
        df[col] = df[col].astype('category')

    return df

def prepare_model_data(df):
    # Identify target column (adjust as needed)
    # Assuming 'Column45' is your target variable
    target_column = 'Column45'

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Encode categorical features
    categorical_cols = X.select_dtypes(include=['category', 'object']).columns
    le = LabelEncoder()
    for col in categorical_cols:
        X[col] = le.fit_transform(X[col].astype(str))

    # Feature selection to reduce dimensions
    selector = SelectKBest(f_classif, k=min(20, X.shape[1]))
    X = selector.fit_transform(X, y)

    return X, y

def train_model(X, y):
    # Split the data with stratification
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Memory-efficient Random Forest
    model = RandomForestClassifier(
        n_estimators=100,  # Reduced number of trees
        max_depth=10,      # Limit tree depth
        random_state=42,
        n_jobs=-1,         # Use all available cores
        max_features='sqrt'# Reduce features considered at each split
    )

    # Train the model
    model.fit(X_train_scaled, y_train)

    # Evaluate
    y_pred = model.predict(X_val_scaled)

    print("\nModel Evaluation:")
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    return model

def main():
    # File path - ensure this is correct
    file_path = r"/content/Solar Data Sept 2017.xlsx"

    try:
        # Load and preprocess data
        df = load_and_preprocess_data(file_path)

        # Prepare data for modeling
        X, y = prepare_model_data(df)

        # Train and evaluate model
        model = train_model(X, y)

    except Exception as e:
        print("An error occurred:")
        print(e)
        import traceback
        traceback.print_exc()

# Run the main function
if __name__ == "__main__":
    main()

An error occurred:
read_excel() got an unexpected keyword argument 'low_memory'


Traceback (most recent call last):
  File "<ipython-input-6-f1a2f6441130>", line 99, in main
    df = load_and_preprocess_data(file_path)
  File "<ipython-input-6-f1a2f6441130>", line 12, in load_and_preprocess_data
    df = pd.read_excel(file_path, engine='openpyxl', low_memory=True)
TypeError: read_excel() got an unexpected keyword argument 'low_memory'


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif

def load_and_preprocess_data(file_path):
    # Load Excel file without the low_memory argument
    df = pd.read_excel(file_path, engine='openpyxl')

    # Print initial dataset info
    print("Initial Dataset Info:")
    print(f"Original Shape: {df.shape}")
    print("\nColumn Types:")
    print(df.dtypes)

    # Remove columns with high missing values
    threshold = 0.7  # Remove columns with more than 70% missing values
    df = df.dropna(thresh=len(df) * (1 - threshold), axis=1)

    # Convert memory-heavy columns to more efficient types
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    # Handle categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        # Convert to category type to reduce memory
        df[col] = df[col].astype('category')

    return df

def prepare_model_data(df):
    # Identify target column (adjust as needed)
    # Assuming 'Column45' is your target variable
    target_column = 'Column45'

    # Check if target column exists
    if target_column not in df.columns:
        print("Available columns:", list(df.columns))
        raise ValueError(f"Target column '{target_column}' not found in the DataFrame")

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Encode categorical features
    categorical_cols = X.select_dtypes(include=['category', 'object']).columns
    le = LabelEncoder()
    for col in categorical_cols:
        X[col] = le.fit_transform(X[col].astype(str))

    # Feature selection to reduce dimensions
    selector = SelectKBest(f_classif, k=min(20, X.shape[1]))
    X = selector.fit_transform(X, y)

    return X, y

def train_model(X, y):
    # Split the data with stratification
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Memory-efficient Random Forest
    model = RandomForestClassifier(
        n_estimators=100,  # Reduced number of trees
        max_depth=10,      # Limit tree depth
        random_state=42,
        n_jobs=-1,         # Use all available cores
        max_features='sqrt'# Reduce features considered at each split
    )

    # Train the model
    model.fit(X_train_scaled, y_train)

    # Evaluate
    y_pred = model.predict(X_val_scaled)

    print("\nModel Evaluation:")
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    return model

def main():
    # File path - ensure this is correct
    file_path = r"/content/Solar Data Sept 2017.xlsx"

    try:
        # Load and preprocess data
        df = load_and_preprocess_data(file_path)

        # Prepare data for modeling
        X, y = prepare_model_data(df)

        # Train and evaluate model
        model = train_model(X, y)

    except Exception as e:
        print("An error occurred:")
        print(e)
        import traceback
        traceback.print_exc()

# Run the main function
if __name__ == "__main__":
    main()

Initial Dataset Info:
Original Shape: (21602, 177)

Column Types:
Column1      object
Column2      object
Column3      object
Column4      object
Column5      object
              ...  
Column173    object
Column174    object
Column175    object
Column176    object
Column177    object
Length: 177, dtype: object
An error occurred:
The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.


  f = msb / msw
Traceback (most recent call last):
  File "<ipython-input-7-732110948fb4>", line 109, in main
    model = train_model(X, y)
  File "<ipython-input-7-732110948fb4>", line 66, in train_model
    X_train, X_val, y_train, y_val = train_test_split(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py", line 2806, in train_test_split
    train, test = next(cv.split(X=arrays[0], y=stratify))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py", line 1843, in split
    for train, test in self._iter_indices(X, y, groups):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py", line 2252, in _iter_indices
    raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be 

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def load_and_preprocess_data(file_path):
    # Load Excel file
    df = pd.read_excel(file_path, engine='openpyxl')

    print("Initial Dataset Info:")
    print(f"Original Shape: {df.shape}")

    # Detailed column type and missing value analysis
    print("\nColumn Types and Non-Null Counts:")
    print(df.info())

    # Remove columns with too many missing values
    df = df.dropna(thresh=len(df) * 0.5, axis=1)

    return df

def prepare_model_data(df):
    # Determine target column (you may need to adjust this)
    # Typically, the last column is used as the target
    target_column = df.columns[-1]

    print(f"\nUsing {target_column} as target variable")

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Encode categorical features
    categorical_cols = X.select_dtypes(include=['object']).columns
    le = LabelEncoder()

    for col in categorical_cols:
        # Only encode if column has multiple unique values
        unique_values = X[col].nunique()
        if unique_values > 1:
            X[col] = le.fit_transform(X[col].astype(str))
        else:
            print(f"Dropping column {col} due to insufficient unique values")
            X = X.drop(columns=[col])

    # Encode target variable
    y = le.fit_transform(y.astype(str))

    # Check class distribution
    unique, counts = np.unique(y, return_counts=True)
    print("\nTarget Variable Distribution:")
    for u, c in zip(unique, counts):
        print(f"Class {u}: {c} samples")

    return X, y

def train_model(X, y):
    # Handling extremely imbalanced data
    from sklearn.utils.class_weight import compute_class_weight

    # Compute class weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y),
        y=y
    )
    class_weight_dict = dict(zip(np.unique(y), class_weights))

    # Split data with adjusted stratification
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    # Reduce dimensionality if needed
    from sklearn.decomposition import PCA
    pca = PCA(n_components=min(20, X.shape[1]))
    X_train_reduced = pca.fit_transform(X_train)
    X_val_reduced = pca.transform(X_val)

    # Memory-efficient Random Forest with class weights
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight=class_weight_dict,
        n_jobs=-1,
        max_features='sqrt'
    )

    # Train the model
    model.fit(X_train_reduced, y_train)

    # Evaluate
    y_pred = model.predict(X_val_reduced)

    print("\nModel Evaluation:")
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    return model

def main():
    # File path
    file_path = r"/content/Solar Data Sept 2017.xlsx"

    try:
        # Load and preprocess data
        df = load_and_preprocess_data(file_path)

        # Prepare data for modeling
        X, y = prepare_model_data(df)

        # Train and evaluate model
        model = train_model(X, y)

    except Exception as e:
        print("An error occurred:")
        print(e)
        import traceback
        traceback.print_exc()

# Run the main function
if __name__ == "__main__":
    main()

Initial Dataset Info:
Original Shape: (21602, 177)

Column Types and Non-Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21602 entries, 0 to 21601
Columns: 177 entries, Column1 to Column177
dtypes: object(177)
memory usage: 29.2+ MB
None

Using Column177 as target variable

Target Variable Distribution:
Class 0: 1 samples
Class 1: 21599 samples
Class 2: 1 samples
Class 3: 1 samples
An error occurred:
The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.


Traceback (most recent call last):
  File "<ipython-input-8-57e9a5e5eb06>", line 120, in main
    model = train_model(X, y)
  File "<ipython-input-8-57e9a5e5eb06>", line 72, in train_model
    X_train, X_val, y_train, y_val = train_test_split(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py", line 2806, in train_test_split
    train, test = next(cv.split(X=arrays[0], y=stratify))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py", line 1843, in split
    for train, test in self._iter_indices(X, y, groups):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py", line 2252, in _iter_indices
    raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.


In [None]:
pip install pandas openpyxl



In [None]:
import pandas as pd

In [None]:
pip install ipywidgets




In [None]:
file_path = r"/content/Solar Data Sept 2017.xlsx"
df = pd.read_excel(file_path)

# Display the first few rows of the data
print(df.head())

                                       Column1               Column2  \
0                                        query                 T_REC   
1                                       string                  time   
2                                           %s                    %s   
3  aia.lev1_euv_12s[2017-09-19T23:59:59Z][171]  2017-09-19T23:59:59Z   
4  aia.lev1_euv_12s[2017-09-19T23:59:59Z][211]  2017-09-19T23:59:59Z   

                   Column3   Column4               Column5    Column6  \
0                    T_OBS  WAVELNTH                  DATE        FSN   
1                      TBD       int                  time        int   
2                      TBD        %d                    %s         %d   
3  2017-09-20T00:00:10.35Z       171  2017-09-26T14:05:10Z  158488054   
4  2017-09-19T23:59:59.07Z       211  2017-09-25T14:17:16Z  158488047   

    Column7     Column8       Column9  Column10  ... Column168 Column169  \
0   EXPTIME     QUALITY        ORIGIN  TELESCOP  ...

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df = pd.read_excel("/content/Solar Data Sept 2017.xlsx")
print(df.columns)

Index(['Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6',
       'Column7', 'Column8', 'Column9', 'Column10',
       ...
       'Column168', 'Column169', 'Column170', 'Column171', 'Column172',
       'Column173', 'Column174', 'Column175', 'Column176', 'Column177'],
      dtype='object', length=177)


In [None]:
X = df.drop(columns=["Column4"])  #Preprossesing the data
y = df["Column45"]

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
for column in X.select_dtypes(include=["object"]).columns:
  le = LabelEncoder()
  X[column] = le.fit_transform(X[column])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42) #splitting the data

In [None]:
#Training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)