In [1]:
!pip install xgboost



In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import cv2
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xg
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge,LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

## We will grayscale and reduce the image size to 28x28 to reduce the dimensionality as we are gonna flattening them.

In [3]:
def load_images_from_folder(folder_path, resize_shape=(28, 28)):
    images = []
    labels = []
    invalid_files = 0
    for filename in os.listdir(folder_path):
        img_path = os.path.join(folder_path, filename)
        if os.path.isfile(img_path):
            exception_occurred = False
            label = None  # Initialize label outside the try block
            
            try:
                img = Image.open(img_path).convert('L')  # Convert to grayscale to reduce dimensionality
                img = img.resize(resize_shape)  # Resize images to 28x28 pixels to reduce dimensionality as we will flatten them 
                img_array = np.array(img)
                label = int(filename.split('_')[0])
            except ValueError:
                invalid_files += 1
                exception_occurred = True
            
            if not exception_occurred: 
                # there are some invalid name files, it's your choice you can rename them if you want, you will get more training data :)))
                images.append(img_array.flatten())
                labels.append(label)
                
    return np.array(images), np.array(labels)

In [4]:
folder_path = '/kaggle/input/opencode-23-kaggle-competition/train/Regression'

In [5]:
X, y = load_images_from_folder(folder_path)
X.shape, y.shape


((4219, 784), (4219,))

In [6]:
X.dtype

dtype('uint8')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .20, random_state = 427)
X_train.shape, y_train.shape,X_test.shape, y_test.shape

((3375, 784), (3375,), (844, 784), (844,))

In [8]:
y_train[:5]

array([ 50,   5, 130,  25,  50])

In [9]:
SEED = 42

In [10]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [11]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
model2 = xg.XGBRegressor(n_estimators=1000, max_depth=20, eta=0.01, subsample=0.8,nthread=4, colsample_bytree=0.6,min_child_weight = 6,
                        reg_alpha = 0.1,scale_pos_weight=1,seed=42,reg_lambda = 0.005) #best as of now
model3 = lgb.LGBMRegressor(
    objective= 'regression',
    metric='rmse',
    num_leaves= 100,
    learning_rate= 0.1,
    max_bin=500,
    boosting_type='dart',
    num_iterations=3000,
    reg_alpha=0.1,
    reg_lambda=0.01
)

ensemble = [
    ('xgb', model2),
    ('lgb', model3)
]

voting_regressor = VotingRegressor(ensemble)

voting_regressor.fit(X_train, y_train)



In [12]:
y_pred = voting_regressor.predict(X_test)
error = rmse(y_pred,y_test)

In [13]:
print(f'Root Mean Square Error is: {error}')
#On submission, the error reduced to 12.3!

Root Mean Square Error is: 12.430358338967396


In [14]:
sample_df = pd.read_csv('/kaggle/input/opencode-23-kaggle-competition/sample_submission.csv')
sample_df

Unnamed: 0,id,target
0,1111111112,100
1,1111111111,5
2,2222222222,25


In [15]:
def make_predictions(model, folder_path, resize_shape=(28, 28)):
    images, filenames = [], []
    for filename in os.listdir(folder_path):
        img_path = os.path.join(folder_path, filename)
        if os.path.isfile(img_path):
            try:
                # Extract the numeric part of the filename
                image_id = int(filename.split('.')[0])  

                img = Image.open(img_path).convert('L')  # Convert to grayscale to reduce dimensionality
                img = img.resize(resize_shape)  # Resize images to 28x28 pixels to reduce dimensionality as we will flatten them 
                img_array = np.array(img)
                images.append(img_array.flatten())
                filenames.append(image_id)
            except ValueError:
                print(f"Invalid file: {filename}")

    # Make predictions
    predictions = voting_regressor.predict(images)

    # Create a DataFrame
    df = pd.DataFrame({'id': filenames, 'target': predictions})

    return df


In [16]:
test_folder_path = "/kaggle/input/opencode-23-kaggle-competition/test/Regression"
result_df = make_predictions(voting_regressor, test_folder_path)
result_df

Unnamed: 0,id,target
0,1477285848,18.500121
1,1477845390,71.248429
2,1477288176,10.285869
3,1477789434,24.704851
4,1477285836,17.352281
...,...,...
1804,1477838988,59.387066
1805,1477852950,87.777913
1806,1479344748,108.977002
1807,1479344712,107.727479


In [17]:
result_df.to_csv('submission.csv',index = False)

In [18]:
sub = pd.read_csv('submission.csv')
sub

Unnamed: 0,id,target
0,1477285848,18.500121
1,1477845390,71.248429
2,1477288176,10.285869
3,1477789434,24.704851
4,1477285836,17.352281
...,...,...
1804,1477838988,59.387066
1805,1477852950,87.777913
1806,1479344748,108.977002
1807,1479344712,107.727479


In [19]:
import joblib
joblib.dump(voting_regressor,'RegressionModel.pkl')

['RegressionModel.pkl']