### RandomForest

In [1]:
import pickle
import re

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the video dataset from a CSV file (path is relative to the working directory).
# NOTE(review): the printed frame further down shows an 'Unnamed: 0' column —
# presumably a row index that was saved into the CSV; confirm, and consider
# dropping it or reading with index_col=0.
df = pd.read_csv('video_data_top10_channels.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,0,1KEbiqRWOkA,Alex The Analyst,7 Mistakes to Avoid During Your Data Analyst J...,When I was a Hiring Managers I saw a lot of pe...,"['Data Analyst', 'Data Analyst job', 'Data Ana...",2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,hd,False
1,1,UOBTLzWY1vs,Alex The Analyst,#DataAnalyst #AnalystBuilder #SQL,Link: AnalystBuilder.com,,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,hd,False
2,2,8zOkBTs0yxs,Alex The Analyst,Q/A Livestream | February Livestream | Ask Me ...,This is February's Livestream where you can co...,"['Data Analyst', 'Data Analyst job', 'Data Ana...",2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,hd,False
3,3,7NBt0V8ebGk,Alex The Analyst,Window Functions in MySQL | Intermediate MySQL,Full MySQL Course: https://www.analystbuilder....,"['Data Analyst', 'Data Analyst job', 'Data Ana...",2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,hd,False
4,4,FGC0cCAgGu0,Alex The Analyst,Twitter making me tear up over here ðŸ¥¹,,,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,hd,False


In [4]:
# Encode the 'definition' column numerically: 'hd' -> 1, 'sd' -> 0.
mapping = dict(hd=1, sd=0)
df['definition'] = df['definition'].replace(to_replace=mapping)

In [5]:
# Convert the free-text columns to integer codes.
# Each column gets its own LabelEncoder, kept in `encoders`, so the
# string <-> code mapping of every column is still available afterwards
# (e.g. for inverse_transform, or to encode new rows for the saved model).
# A single shared encoder would only remember the last column it was fitted on.
text_columns = ['title', 'tags', 'description']
encoders = {}
for column in text_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Kept for backward compatibility: as before, `le` ends up fitted on 'description'.
le = encoders['description']

In [6]:
# Preview the frame after encoding: 'title', 'description' and 'tags' now hold integer codes.
df.head(5)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,0,1KEbiqRWOkA,Alex The Analyst,111,3983,257,2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,1,False
1,1,UOBTLzWY1vs,Alex The Analyst,6,2319,3601,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,1,False
2,2,8zOkBTs0yxs,Alex The Analyst,3401,3276,204,2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,1,False
3,3,7NBt0V8ebGk,Alex The Analyst,4774,670,257,2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,1,False
4,4,FGC0cCAgGu0,Alex The Analyst,4447,4330,3601,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,1,False


In [7]:
# Count the videos that received more than 10,000 likes.
many_likes = df['likeCount'] > 10000
count_likes_greater_than_10000 = df.loc[many_likes, 'likeCount'].count()

In [8]:
# Display the number of videos with more than 10,000 likes.
count_likes_greater_than_10000

200

In [9]:
# Check how 'duration' is stored before parsing it.
df['duration'].dtype

dtype('O')

In [10]:
# Function to convert duration string to seconds
def duration_to_seconds(duration_str):
    """Convert an ISO 8601 duration string to a whole number of seconds.

    Supports the form ``P[nW][nD][T[nH][nM][nS]]``, e.g. ``'PT11M54S'``,
    ``'PT1H34M57S'``, ``'P1DT2H3M4S'`` or ``'P0D'``. Every component is
    optional; missing components count as zero.

    Parameters
    ----------
    duration_str : str
        The duration to parse.

    Returns
    -------
    int
        Total duration in seconds.

    Raises
    ------
    ValueError
        If ``duration_str`` is a string that does not match the supported form.
    TypeError
        If ``duration_str`` is not a string (e.g. a missing value).
    """
    # Anchored match of the whole string: anything unexpected is rejected
    # instead of being silently parsed as 0 seconds.
    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration_str,
    )
    if match is None:
        raise ValueError(f'Unsupported ISO 8601 duration: {duration_str!r}')

    # Absent components come back as None and are treated as 0.
    weeks, days, hours, minutes, seconds = (int(part or 0) for part in match.groups())

    return weeks * 604800 + days * 86400 + hours * 3600 + minutes * 60 + seconds

# Parse every duration string into seconds and store the result in a new column
df['duration_seconds'] = df['duration'].map(duration_to_seconds)

print(df)

      Unnamed: 0     video_id      channelTitle  title  description  tags  \
0              0  1KEbiqRWOkA  Alex The Analyst    111         3983   257   
1              1  UOBTLzWY1vs  Alex The Analyst      6         2319  3601   
2              2  8zOkBTs0yxs  Alex The Analyst   3401         3276   204   
3              3  7NBt0V8ebGk  Alex The Analyst   4774          670   257   
4              4  FGC0cCAgGu0  Alex The Analyst   4447         4330  3601   
...          ...          ...               ...    ...          ...   ...   
4928        4928  MGD_b2w_GU4           sentdex   1844         2904  2662   
4929        4929  OyZkXsgv5qk           sentdex   3358         2905  2455   
4930        4930  8PzDfykGg_g           sentdex   3185         2901  2860   
4931        4931  wAwQ-noyB98           sentdex   2608         2879  2859   
4932        4932  gJNaiIYodZg           sentdex    131          573   674   

               publishedAt  viewCount  likeCount  favouriteCount  \
0     2

In [11]:
# Confirm the dtype of the parsed durations.
df['duration_seconds'].dtype

dtype('int64')

In [12]:
# Print the parsed durations (in seconds).
print(df['duration_seconds'])

0        714
1         38
2       5697
3        809
4         16
        ... 
4928     446
4929     859
4930    1265
4931     661
4932     137
Name: duration_seconds, Length: 4933, dtype: int64


In [13]:
# Preview the frame with the new 'duration_seconds' column.
df.head(5)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,duration_seconds
0,0,1KEbiqRWOkA,Alex The Analyst,111,3983,257,2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,1,False,714
1,1,UOBTLzWY1vs,Alex The Analyst,6,2319,3601,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,1,False,38
2,2,8zOkBTs0yxs,Alex The Analyst,3401,3276,204,2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,1,False,5697
3,3,7NBt0V8ebGk,Alex The Analyst,4774,670,257,2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,1,False,809
4,4,FGC0cCAgGu0,Alex The Analyst,4447,4330,3601,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,1,False,16


In [14]:
# Check how 'publishedAt' is stored before converting it to datetimes.
df['publishedAt'].dtype

dtype('O')

In [15]:
# Parse 'publishedAt' into timezone-aware datetimes in UTC.
# utc=True converts any UTC offset to UTC and also accepts timezone-naive
# strings (treated as UTC), whereas .dt.tz_convert raises on naive values.
df['datetime'] = pd.to_datetime(df['publishedAt'], utc=True)

# Convert datetime objects to Unix timestamps (integer seconds since 1970-01-01 UTC)
df['timestamp'] = (df['datetime'] - pd.Timestamp("1970-01-01", tz='UTC')) // pd.Timedelta(seconds=1)


The code above adds two new columns to the DataFrame: 'datetime', containing the parsed UTC datetimes, and 'timestamp', containing the Unix timestamps (integers) corresponding to the datetime strings in the 'publishedAt' column.

In [16]:
# 'publishedAt' itself was not modified by the conversion; the parsed values
# live in the separate 'datetime' and 'timestamp' columns.
df['publishedAt'].dtype

dtype('O')

In [17]:
# Confirm the dtype of the Unix timestamps.
df['timestamp'].dtype

dtype('int64')

In [18]:
# Display the Unix timestamps (seconds since 1970-01-01 UTC).
df['timestamp']

0       1709643601
1       1709300609
2       1709226618
3       1709038808
4       1708695267
           ...    
4928    1370876245
4929    1370726011
4930    1369418743
4931    1368739492
4932    1368212744
Name: timestamp, Length: 4933, dtype: int64

In [19]:
# Preview the frame with the new 'datetime' and 'timestamp' columns.
df.head(5)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,duration_seconds,datetime,timestamp
0,0,1KEbiqRWOkA,Alex The Analyst,111,3983,257,2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,1,False,714,2024-03-05 13:00:01+00:00,1709643601
1,1,UOBTLzWY1vs,Alex The Analyst,6,2319,3601,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,1,False,38,2024-03-01 13:43:29+00:00,1709300609
2,2,8zOkBTs0yxs,Alex The Analyst,3401,3276,204,2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,1,False,5697,2024-02-29 17:10:18+00:00,1709226618
3,3,7NBt0V8ebGk,Alex The Analyst,4774,670,257,2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,1,False,809,2024-02-27 13:00:08+00:00,1709038808
4,4,FGC0cCAgGu0,Alex The Analyst,4447,4330,3601,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,1,False,16,2024-02-23 13:34:27+00:00,1708695267


In [20]:
# Flag videos whose view count and like count are both greater than 1000.
df['trending_rows'] = df['viewCount'].gt(1000) & df['likeCount'].gt(1000)

We create a `trending_rows` column that flags videos whose viewCount and likeCount are both greater than 1000. The flag is boolean at this point; the next cell converts it to 1 (trending) / 0 (not trending).

In [21]:
# Convert the boolean flag to integers: True -> 1, False -> 0
df['trending_rows'] = df['trending_rows'].astype('int64')

In [22]:
## Select the rows flagged as trending (viewCount and likeCount both greater than 1000)
is_trending = df['trending_rows'] == 1
trend = df.loc[is_trending]

In [23]:
# Number of rows and columns in the trending subset.
trend.shape

(1705, 18)

We have 1705 such rows (viewCount and likeCount both greater than 1000).

In [24]:
# Display the 0/1 trending label for every video.
df['trending_rows'] 

0       0
1       0
2       0
3       0
4       0
       ..
4928    0
4929    0
4930    0
4931    0
4932    0
Name: trending_rows, Length: 4933, dtype: int64

In [25]:
# Preview the frame with the new 'trending_rows' label column.
df.head(5)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,duration_seconds,datetime,timestamp,trending_rows
0,0,1KEbiqRWOkA,Alex The Analyst,111,3983,257,2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,1,False,714,2024-03-05 13:00:01+00:00,1709643601,0
1,1,UOBTLzWY1vs,Alex The Analyst,6,2319,3601,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,1,False,38,2024-03-01 13:43:29+00:00,1709300609,0
2,2,8zOkBTs0yxs,Alex The Analyst,3401,3276,204,2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,1,False,5697,2024-02-29 17:10:18+00:00,1709226618,0
3,3,7NBt0V8ebGk,Alex The Analyst,4774,670,257,2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,1,False,809,2024-02-27 13:00:08+00:00,1709038808,0
4,4,FGC0cCAgGu0,Alex The Analyst,4447,4330,3601,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,1,False,16,2024-02-23 13:34:27+00:00,1708695267,0


In [26]:
# Train/test split
# Feature matrix: the columns the model learns from
features = ['title', 'tags', 'timestamp', 'duration_seconds']
X = df.loc[:, features]
# Target vector: the 0/1 trending label
y = df.loc[:, 'trending_rows']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
     

In [30]:
# Model selection
# Random Forest classifier, chosen as an ensemble method for its accuracy,
# reduced overfitting and built-in feature importances
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test set
y_pred = model.predict(X_test)

# Overall accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1
print(classification_report(y_test, y_pred))

Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       644
           1       0.70      0.58      0.64       343

    accuracy                           0.77       987
   macro avg       0.75      0.73      0.73       987
weighted avg       0.76      0.77      0.76       987



 import joblib

#### Save the model to a file
joblib.dump(model, 'random_forest_classifier_model.pkl')

#### Load the model from the file
loaded_model = joblib.load('random_forest_classifier_model.pkl')


from sklearn.metrics import accuracy_score, classification_report

#### Load the model from the file
loaded_model = joblib.load('random_forest_classifier_model.pkl')

#### Make predictions on the test set
y_pred = loaded_model.predict(X_test)

#### Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

#### Display classification report
print(classification_report(y_test, y_pred))

In [52]:
# Save the trained model to disk.
# (`pickle` is imported with the other imports in the first cell of the notebook.)
with open('random_forest_classi_model.pkl','wb') as file:
    pickle.dump(model,file)

In [53]:
# Load the model back from the file written in the previous cell.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files from a
# trusted source (here, the file this notebook just created).
with open('random_forest_classi_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [54]:
# Make predictions on the test set with the reloaded model
# (this overwrites the y_pred produced earlier by the in-memory model)
y_pred = loaded_model.predict(X_test)

In [56]:
# Score the reloaded model's predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1 for the same predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       644
           1       0.70      0.58      0.64       343

    accuracy                           0.77       987
   macro avg       0.75      0.73      0.73       987
weighted avg       0.76      0.77      0.76       987

