In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# importing from the ensemble module to check how other algorithms perform on this dataset
# regressors are used instead of classifiers because we have to predict a number, i.e. a continuous value, whereas a classifier predicts a category
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor # these all follows scikit learn rules also includes bagging classifier

# this comes with the xgboost library and is built for high performance
from xgboost import XGBRegressor 

# to improve model performance through hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the cleaned, encoded dataset and list every column it contains.
df = pd.read_csv('cleaned_encoded.csv')
print(df.columns.tolist())

# Report whether the 'actors' column is present in the loaded frame.
print("yes" if 'actors' in df.columns else "no")

In [None]:
# Select the modelling target and the feature matrix.

# Columns that are not used as model inputs.
# errors='ignore' keeps this cell safe to re-run: `df` is reassigned below, so on a
# second run these columns are already gone and a plain drop would raise KeyError.
# NOTE: this also means a column that is missing on the first run is skipped silently.
unwanted_columns = ['Name', 'Year', 'Duration', 'Votes', 'Genre',
                    'Actor 1', 'Actor 2', 'Actor 3', 'Director',
                    'Genre_list', 'actors']
df = df.drop(columns=unwanted_columns, errors='ignore')

# Separate features (x) from the target (y).
x = df.drop(columns='Rating')  # features
y = df['Rating']               # target

# Rows and columns remaining after the drop (target column included).
print(df.shape)


In [None]:
# Split into training and test sets: 30% of the rows are held out for testing and
# the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Report the size of each partition, then the dtype of every training feature.
for split_name, split_frame in (("Train", x_train), ("Test", x_test)):
    print(f"{split_name} shape: {split_frame.shape}")
print(x_train.dtypes)

In [None]:
# Visualise the Rating distribution of the train and test splits.
#
# Purpose: check whether the target is skewed.
#   - longer tail on the right -> positive skew
#   - longer tail on the left  -> negative skew
# The original notes record that both histograms looked bell shaped, i.e. roughly
# symmetric.
#
# Each panel draws only its own split. sns.histplot is used instead of plt.hist
# because plt.hist draws the bars only, while histplot can also overlay a KDE
# curve (kde=True).
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(10, 4))  # 1 row, 2 columns

for ax, ratings, title in ((train_ax, y_train, 'Train Rating'),
                           (test_ax, y_test, 'Test Rating')):
    sns.histplot(ratings, bins=20, kde=True, color='skyblue', edgecolor='black', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Rating')
    ax.set_ylabel('count')  # number of rows falling in each bin

# Skewness of the full Rating column as a number (0 means perfectly symmetric).
# The original notes record a value below 0.5 for this dataset.
print(df['Rating'].skew())

fig.tight_layout()  # adjusts spacing so the two panels do not overlap
plt.show()


In [None]:
# Baseline model: simple linear regression.
#
# Linear regression fits an equation of the form
#     y = b0 + b1*x1 + b2*x2 + b3*x3 + ...
# to predict the movie rating. .fit() learns the coefficients that minimise the
# prediction error on the training data and stores them on the model for later use.
model = LinearRegression()  # untrained estimator, ready to learn from the data
model.fit(x_train, y_train)

print(model.coef_)  # learned coefficient of each feature
print(model.intercept_)  # intercept / bias term (b0)

# Apply the learned equation to the held-out test features.
y_pred = model.predict(x_test)

# Error metrics between the predictions and the actual ratings.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# MSE is expressed in squared target units, so its square root (RMSE) is easier to
# interpret and compare directly with the ratings. It is computed by hand because
# the original notes record that mean_squared_error(..., squared=False) was not
# supported by the installed scikit-learn version.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(" mean absolute error :", mae)  # lower MAE means a better model
print("mean squared error: ", mse)
print("rmse: ", rmse)
print("r2 score:", r2)





In [None]:
# Residual plot: prediction error against predicted value, used to spot outliers.
residuals = y_test - y_pred

fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)  # x axis: prediction, y axis: residual

# Dashed reference line at zero residual, i.e. where the prediction is exact.
ax.axhline(y=0, color='b', linestyle='--')

ax.set_xlabel("predicted value from model")
ax.set_ylabel('residuals')
ax.set_title(" residue plot to detect outlier")
plt.show()

In [None]:
# Actual vs predicted ratings, to see where the model is struggling.
fig, ax = plt.subplots(figsize=(10, 6))

# alpha=0.6 makes the markers semi-transparent so overlapping points stay visible.
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, ax=ax)

# Diagonal y = x running from the smallest to the largest actual rating: a perfect
# model would place every point on this line.
rating_range = [y_test.min(), y_test.max()]
ax.plot(rating_range, rating_range, '--r', label='Ideal fit of the model')

ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs Predicted Values')
ax.legend()  # describes the labelled elements of the plot
ax.grid(True)  # grid makes it easier to line points up with the axes
plt.show()

In [None]:
# Compare three tree-based regressors.
#
# Instead of fitting each model in its own cell, every model is tuned, evaluated on
# the test split and added to one table so the metrics can be compared side by side.

# Base estimators, keyed by the name shown in the results table.
models = {
    # n_estimators = number of trees the ensemble builds
    "randomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "gradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    # verbosity=0 keeps XGBoost from printing log messages
    "xgboost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
}

# Hyperparameter grids for the search. The outer keys match the keys of `models`;
# the inner keys must be valid parameter names of the corresponding estimator.
hp = {
    "randomForest": {
        "n_estimators": [100, 200],
        "max_depth": [10, 20],  # maximum depth of each tree
    },
    "gradientBoosting": {
        "n_estimators": [100, 200],
        # boosting learns from the previous trees' mistakes step by step; the
        # learning rate is kept low so the model improves steadily instead of
        # overshooting
        "learning_rate": [0.05, 0.1],
    },
    "xgboost": {
        "n_estimators": [100, 200],
        "learning_rate": [0.05, 0.1],
    },
}

results = []  # one dict of metrics per model

# Tune, predict and score each model in turn.
# The loop uses its own names (model_name / estimator / y_pred_tuned) so it does not
# overwrite `model` and `y_pred` from the linear-regression cell; otherwise re-running
# the earlier plotting cells would silently show the last tree model's predictions.
for model_name, estimator in models.items():
    # hp[model_name] is the parameter grid for this estimator.
    # cv=3: 3-fold cross-validation on the training split, i.e. the data is cut into
    # 3 parts and each part is used once for validation while the other 2 train.
    # n_jobs=-1 runs the search in parallel on all available CPU cores.
    grid = GridSearchCV(estimator, hp[model_name], cv=3, scoring='r2', n_jobs=-1)
    grid.fit(x_train, y_train)
    # After fitting, grid exposes best_params_, best_score_ and best_estimator_.

    modelbest = grid.best_estimator_
    y_pred_tuned = modelbest.predict(x_test)

    MAE = mean_absolute_error(y_test, y_pred_tuned)
    MSE = mean_squared_error(y_test, y_pred_tuned)
    RMSE = np.sqrt(MSE)
    r2_Score = r2_score(y_test, y_pred_tuned)

    # Collect the metrics as one row of the comparison table.
    results.append({
        "model": model_name,
        "mae": MAE,
        "mse": MSE,
        "rmse": RMSE,
        "r2 score": r2_Score,
    })

# Best model (highest test-set R2) first.
resultsdf = pd.DataFrame(results)
print(resultsdf.sort_values(by='r2 score', ascending=False))
