#Import the Dataset and Libraries


In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('train.csv')

# Check the shape and data types
print(data.shape)
print(data.dtypes)

(14999, 9)
vidid        object
adview        int64
views        object
likes        object
dislikes     object
comment      object
published    object
duration     object
category     object
dtype: object


In [2]:
data.head()

Unnamed: 0,vidid,adview,views,likes,dislikes,comment,published,duration,category
0,VID_18655,40,1031602,8523,363,1095,2016-09-14,PT7M37S,F
1,VID_14135,2,1707,56,2,6,2016-10-01,PT9M30S,D
2,VID_2187,1,2023,25,0,2,2016-07-02,PT2M16S,C
3,VID_23096,6,620860,777,161,153,2016-07-27,PT4M22S,H
4,VID_10175,1,666,1,0,0,2016-06-29,PT31S,D


#Data Exploration and Visualization


###Check for Missing Values:


In [3]:
print(data.isnull().sum())


vidid        0
adview       0
views        0
likes        0
dislikes     0
comment      0
published    0
duration     0
category     0
dtype: int64


###Visualize Distributions: Use seaborn and matplotlib to visualize distributions and relationships between features

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot distribution of adviews
sns.histplot(data['adview'], kde=True)
plt.show()

# Heatmap to check correlations
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

NameError: name 'data' is not defined

#Data Cleaning

###Handling Missing Values

In [4]:
# Example: Drop rows with missing values
data = data.dropna()


###Data Type Conversion:
Convert published date to datetime format.
Convert duration to numerical format (e.g., total seconds).

In [5]:
data['published'] = pd.to_datetime(data['published'], format='%Y-%m-%d')

# Function to convert duration to seconds
import re
def duration_to_seconds(duration):
    match = re.match('PT(\d+H)?(\d+M)?(\d+S)?', duration)
    hours = int(match.group(1)[:-1]) if match.group(1) else 0
    minutes = int(match.group(2)[:-1]) if match.group(2) else 0
    seconds = int(match.group(3)[:-1]) if match.group(3) else 0
    return hours * 3600 + minutes * 60 + seconds

data['duration'] = data['duration'].apply(duration_to_seconds)


In [6]:
data.head()

Unnamed: 0,vidid,adview,views,likes,dislikes,comment,published,duration,category
0,VID_18655,40,1031602,8523,363,1095,2016-09-14,457,F
1,VID_14135,2,1707,56,2,6,2016-10-01,570,D
2,VID_2187,1,2023,25,0,2,2016-07-02,136,C
3,VID_23096,6,620860,777,161,153,2016-07-27,262,H
4,VID_10175,1,666,1,0,0,2016-06-29,31,D


In [7]:
data['year'] = data['published'].dt.year
data['month'] = data['published'].dt.month
data['day'] = data['published'].dt.day


#Feature Transformation

In [8]:
# Drop unnecessary columns
data = data.drop(columns=['vidid', 'published'])

###One-Hot Encoding for category

In [9]:
data = pd.get_dummies(data, columns=['category'], drop_first=True)


###Extracting Features from published

In [10]:
data.head()

Unnamed: 0,adview,views,likes,dislikes,comment,duration,year,month,day,category_B,category_C,category_D,category_E,category_F,category_G,category_H
0,40,1031602,8523,363,1095,457,2016,9,14,False,False,False,False,True,False,False
1,2,1707,56,2,6,570,2016,10,1,False,False,True,False,False,False,False
2,1,2023,25,0,2,136,2016,7,2,False,True,False,False,False,False,False
3,6,620860,777,161,153,262,2016,7,27,False,False,False,False,False,False,True
4,1,666,1,0,0,31,2016,6,29,False,False,True,False,False,False,False


In [14]:
# Convert object columns to numeric, forcing any errors to NaN
data['views'] = pd.to_numeric(data['views'], errors='coerce')
data['likes'] = pd.to_numeric(data['likes'], errors='coerce')
data['dislikes'] = pd.to_numeric(data['dislikes'], errors='coerce')
data['comment'] = pd.to_numeric(data['comment'], errors='coerce')

# Check for NaN values in the converted columns
print(data[['views', 'likes', 'dislikes', 'comment']].isna().sum())

# Handle NaN values (e.g., fill with 0 or mean, or drop)
data = data.fillna(0)  # Option 1: Fill NaNs with 0
# OR
# data = data.dropna()  # Option 2: Drop rows with NaNs

# Verify that the columns are now numeric
print(data.dtypes)

# Proceed with the rest of your data processing, scaling, and model training


views         2
likes       155
dislikes    155
comment     233
dtype: int64
adview          int64
views         float64
likes         float64
dislikes      float64
comment       float64
duration        int64
year            int32
month           int32
day             int32
category_B       bool
category_C       bool
category_D       bool
category_E       bool
category_F       bool
category_G       bool
category_H       bool
dtype: object


In [15]:
# Split data into features (X) and target (y)
X = data.drop(columns=['adview'])
y = data['adview']

#Normalization and Data Splitting

###Normalize Features

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)


###Split Data:

In [18]:
from sklearn.model_selection import train_test_split

X = scaled_data
y = data['adview']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Model Training and Evaluation

###Linear Regression Example

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred_lin))
print("Linear Regression R2:", r2_score(y_test, y_pred_lin))


Linear Regression MSE: 10741158967.71663
Linear Regression R2: 0.0016635261346891195


### Support Vector Machine

In [24]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Train and Evaluate SVR
svr_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
print("SVR MSE:", mean_squared_error(y_test, y_pred_svr))
print("SVR R2:", r2_score(y_test, y_pred_svr))

SVR MSE: 10771326740.265295
SVR R2: -0.0011404159502503841


### Decision Tree

In [21]:
from sklearn.tree import DecisionTreeRegressor


# Train and Evaluate Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print("Decision Tree MSE:", mean_squared_error(y_test, y_pred_dt))
print("Decision Tree R2:", r2_score(y_test, y_pred_dt))

Decision Tree MSE: 10965367023.793667
Decision Tree R2: -0.01917548023221194


### Random Forest

In [25]:
from sklearn.ensemble import RandomForestRegressor

# Train and Evaluate Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest MSE:", mean_squared_error(y_test, y_pred_rf))
print("Random Forest R2:", r2_score(y_test, y_pred_rf))

Random Forest MSE: 10422558832.087505
Random Forest R2: 0.031275799533953985


#ANN Model Building

###Build and Train an ANN

In [26]:
from keras.models import Sequential
from keras.layers import Dense

ann_model = Sequential()
ann_model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
ann_model.add(Dense(64, activation='relu'))
ann_model.add(Dense(1, activation='linear'))

ann_model.compile(optimizer='adam', loss='mean_squared_error')
ann_model.fit(X_train, y_train, epochs=50, batch_size=10, validation_data=(X_test, y_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 577228352.0000 - val_loss: 10767430656.0000
Epoch 2/50
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 655336960.0000 - val_loss: 10761761792.0000
Epoch 3/50
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 673725376.0000 - val_loss: 10757214208.0000
Epoch 4/50
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 545474688.0000 - val_loss: 10754313216.0000
Epoch 5/50
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 809443776.0000 - val_loss: 10752569344.0000
Epoch 6/50
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 906421248.0000 - val_loss: 10749845504.0000
Epoch 7/50
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 764921472.0000 - val_loss: 10748427264.0000
Epoch 8/50
[1m1200/

<keras.src.callbacks.history.History at 0x798e776d9330>

#Model Selection and Prediction

###Save the Best Model

In [27]:
import joblib
joblib.dump(rf_model, 'best_model.pkl')


['best_model.pkl']

###Predict on Test Set

In [29]:
y_pred = rf_model.predict(X_test)
y_pred

array([1.95600e+01, 5.08990e+02, 4.76000e+00, ..., 9.98700e+03,
       1.04698e+03, 5.28000e+01])

In [30]:
y_pred[:10]

array([1.956000e+01, 5.089900e+02, 4.760000e+00, 5.840000e+00,
       1.123812e+04, 5.235980e+03, 7.750000e+00, 5.330000e+00,
       1.468950e+03, 1.403000e+01])