In [6]:
import pandas as pd

# Load the raw dataset. latin-1 is given explicitly instead of relying on
# read_csv's default UTF-8 decoding.
data = pd.read_csv('Large language models (2024).csv', encoding='latin-1')

# Display the first few rows of the dataset
print(data.head())

# Check the data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Summary statistics
print(data.describe())

# Convert the 'Parameters' column to a numeric dtype. Non-numeric entries
# become NaN because of errors='coerce'. Only 'Parameters' is converted here;
# 'Tokens' is left exactly as loaded.
data['Parameters'] = pd.to_numeric(data['Parameters'], errors='coerce')

# Fill missing 'Parameters' values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated and does not update `data` when pandas Copy-on-Write is enabled.
# NOTE(review): the median is computed over every row, including rows that are
# dropped below for a missing ALScore - confirm that is intended.
parameters_median = data['Parameters'].median()
data['Parameters'] = data['Parameters'].fillna(parameters_median)

# Convert the 'ALScore' column to a numeric dtype (non-numeric entries -> NaN)
data['ALScore'] = pd.to_numeric(data['ALScore'], errors='coerce')

# ALScore is not imputed: rows without a usable ALScore are removed.
data = data.dropna(subset=['ALScore'])

# Verify changes
data.info()


                    Model Comapany Arch Parameters Tokens   Ratio ALScore  \
0                 Olympus   Amazon  TBA       2000   40000  20:01    29.8   
1                   GPT-5   OpenAI  TBA       2000     TBA    TBA     TBA   
2                   GPT-6   OpenAI  TBA        TBA     TBA    TBA     TBA   
3  AuroraGPT (ScienceGPT)      ANL  TBA       1000     TBA    TBA     TBA   
4                  Grok-2      xAI  TBA        TBA     TBA    TBA     TBA   

  Training dataset Release Date  \
0              TBA          TBA   
1              TBA          TBA   
2              TBA          TBA   
3              TBA          TBA   
4              TBA          TBA   

                                               Notes  \
0  New related Titan details: '$65m training run....   
1                                          Due 2024.   
2                                          Due 2025.   
3  https://tpc.dev/2023/11/10/tpc-announced-with-...   
4                                          Due

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Hold-out configuration: 20% of the rows are reserved for evaluation, and the
# split is seeded so it is identical on every run.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# Regression target and its single predictor (kept as a one-column DataFrame).
y = data['ALScore']
X = data[['Parameters']]

# Partition predictor and target into training and evaluation subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Fit a linear regression on the training rows; fit() returns the estimator
# itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the true values.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report both metrics, one per line.
print(
    f'Mean Squared Error (MSE): {mse}',
    f'R-squared (R^2) Score: {r2}',
    sep='\n',
)


Mean Squared Error (MSE): 8.387408816784339
R-squared (R^2) Score: 0.6729798015787776
