In [49]:
import pandas as pd

# Function to read the column names from the first line of the CSV file
def read_star_column_name(file_path):
    with open(file_path, "r", encoding="ISO-8859-1") as file:
        first_line = file.readline().strip()

    column_names = [name.strip() for name in first_line.split(",")]

    return column_names

# Adjust the file path to your dataset location
file_path = "C:/Users/USER/Documents/GitHub/Infinite_Stratos_Project/data/xxx_dataset.csv"

# Read the column names from the dataset
column_names = read_star_column_name(file_path)

# Load the CSV file into a DataFrame using the correct column names
df = pd.read_csv(
    file_path,
    header=None,  # Since we're specifying the column names manually
    names=column_names,
    sep=",",  # Assuming it's comma-separated
    engine="python"
)

# Display the first few rows to verify everything works as expected
print(df.head())

# Handle missing values in the dataset
df.dropna(inplace=True)

# Drop rows with "?" in any column
for column in df.columns:
    df.drop(df[df[column] == "?"].index, inplace=True)

# Convert object columns to numeric
def map_data(x):
    convert = x.unique()
    return x.map(dict(zip(convert, range(1, len(convert) + 1))))

for column in df.columns:
    if df[column].dtype == "object":
        df[column] = map_data(df[column])

# Convert the entire dataframe to integers
df = df.astype(int)

# Split the data into training features and target variable
X_train = df.drop("Temperature", axis=1)  # Drop 'Budget_USD' column by name
y_train = df["Temperature"]  # Target variable is the 'Budget_USD' column

# Prepare the test data (similar steps for the test data)
df_test = pd.read_csv(
    "C:/Users/USER/Documents/GitHub/Infinite_Stratos_Project/data/xxx_dataset.csv",
    header=None,
    names=column_names,
    engine="python",
)

# Handle missing values in the test dataset
df_test.dropna(inplace=True)

# Drop rows with "?" in any column for the test dataset
for column in df_test.columns:
    df_test.drop(df_test[df_test[column] == "?"].index, inplace=True)

# Convert object columns in the test dataset to numeric
for column in df_test.columns:
    if df_test[column].dtype == "object":
        df_test[column] = map_data(df_test[column])

# Convert the test data to integers
df_test = df_test.astype(int)

# Print the test dataset information
print(df_test.info())

# Split the test data into features and target
X_test = df_test.drop("Temperature", axis=1)  # Drop 'Budget_USD' column from test data
y_test = df_test["Temperature"]  # Target variable in the test data


   Temperature       L       R    A_M  Color  Spectral_Class  Type
0  Temperature       L       R    A_M  Color  Spectral_Class  Type
1         3068  0.0024    0.17  16.12    Red               M     0
2         3042  0.0005  0.1542   16.6    Red               M     0
3         2600  0.0003   0.102   18.7    Red               M     0
4         2800  0.0002    0.16  16.65    Red               M     0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Temperature     241 non-null    int64
 1   L               241 non-null    int64
 2   R               241 non-null    int64
 3   A_M             241 non-null    int64
 4   Color           241 non-null    int64
 5   Spectral_Class  241 non-null    int64
 6   Type            241 non-null    int64
dtypes: int64(7)
memory usage: 13.3 KB
None


In [50]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.model_selection import cross_val_score

# Perform cross-validation on the training set

# Function to read the column names from the first line of the CSV file
def read_star_column_name(file_path):
    with open(file_path, "r", encoding="ISO-8859-1") as file:
        first_line = file.readline().strip()

    column_names = [name.strip() for name in first_line.split(",")]

    return column_names

# Adjust the file path to your dataset location
file_path = "C:/Users/USER/Documents/GitHub/Infinite_Stratos_Project/data/xxx_dataset.csv"

# Read the column names from the dataset
column_names = read_star_column_name(file_path)

# Load the CSV file into a DataFrame using the correct column names
df = pd.read_csv(
    file_path,
    header=None,  # Since we're specifying the column names manually
    names=column_names,
    sep=",",  # Assuming it's comma-separated
    engine="python"
)

# Handle missing values in the dataset
df.dropna(inplace=True)

# Drop rows with "?" in any column
for column in df.columns:
    df.drop(df[df[column] == "?"].index, inplace=True)

# Convert object columns to numeric
def map_data(x):
    convert = x.unique()
    return x.map(dict(zip(convert, range(1, len(convert) + 1))))

for column in df.columns:
    if df[column].dtype == "object":
        df[column] = map_data(df[column])

# Convert the entire dataframe to integers
df = df.astype(int)

# Split the data into training features and target variable
X = df.drop("Temperature", axis=1)  # Drop 'Temperature' column
y = df["Temperature"]  # Target variable is the 'Temperature' column

# **Fix the split issue**
# Remove classes with fewer than 2 samples to avoid stratification errors
value_counts = y.value_counts()
to_remove = value_counts[value_counts < 2].index
df_filtered = df[~df["Temperature"].isin(to_remove)]

X = df_filtered.drop(columns=["Temperature"])
y = df_filtered["Temperature"]

# Proper train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=11, random_state=42, stratify=y
)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the RandomForest model
model = RandomForestClassifier(
    n_estimators=100, random_state=50, class_weight="balanced"
)
model.fit(X_train_scaled, y_train)
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=2)
# Predict on the test set
rfc_result_trained = model.predict(X_test_scaled)

# Calculate accuracy and print the classification report
accuracy = accuracy_score(y_test, rfc_result_trained) * 100
print(f"Test accuracy: {accuracy:.4f} %")
print("\nClassification Report:")
print(classification_report(y_test, rfc_result_trained))

# Save the model and scaler
joblib.dump(model, "C:/Users/USER/Documents/GitHub/Infinite_Stratos_Project/exported_model/rf/rf_Budget_model.pkl")
joblib.dump(scaler, "C:/Users/USER/Documents/GitHub/Infinite_Stratos_Project/exported_model/rf/rf_Budget_scaler.pkl")




Test accuracy: 45.4545 %

Classification Report:
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         1
           9       0.50      1.00      0.67         1
          11       0.00      0.00      0.00         1
          47       0.00      0.00      0.00         1
          54       1.00      1.00      1.00         1
          62       0.50      1.00      0.67         1
          69       0.00      0.00      0.00         1
         119       0.00      0.00      0.00         1
         132       0.00      0.00      0.00         1
         133       0.00      0.00      0.00         1
         182       0.50      1.00      0.67         1

    accuracy                           0.45        11
   macro avg       0.32      0.45      0.36        11
weighted avg       0.32      0.45      0.36        11



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['C:/Users/USER/Documents/GitHub/Infinite_Stratos_Project/exported_model/rf/rf_Budget_scaler.pkl']