In [None]:
import pandas as pd
# Read the Netflix catalogue from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
print(df.head(n=5))

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [None]:
# Label and last rows emitted by one print call, separated by a newline.
print("Tail of the dataset:", df.tail(), sep="\n")

Tail of the dataset:
     show_id     type        title         director  \
8802   s8803    Movie       Zodiac    David Fincher   
8803   s8804  TV Show  Zombie Dumb              NaN   
8804   s8805    Movie   Zombieland  Ruben Fleischer   
8805   s8806    Movie         Zoom     Peter Hewitt   
8806   s8807    Movie       Zubaan      Mozez Singh   

                                                   cast        country  \
8802  Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...  United States   
8803                                                NaN            NaN   
8804  Jesse Eisenberg, Woody Harrelson, Emma Stone, ...  United States   
8805  Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...  United States   
8806  Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...          India   

             date_added  release_year rating   duration  \
8802  November 20, 2019          2007      R    158 min   
8803       July 1, 2019          2018  TV-Y7  2 Seasons   
8804   November 1, 2019   

In [None]:
# Column names with the dtype pandas inferred for each.
print("Column Names and Data Types:", df.dtypes, sep="\n")

Column Names and Data Types:
show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [None]:
# (rows, columns) of the loaded frame.
print("Shape of the dataset:", df.shape, sep="\n")

Shape of the dataset:
(8807, 12)


In [None]:
# Summary statistics with describe()'s defaults spelled out.
print(df.describe(percentiles=None, include=None, exclude=None))

       release_year
count   8807.000000
mean    2014.180198
std        8.819312
min     1925.000000
25%     2013.000000
50%     2017.000000
75%     2019.000000
max     2021.000000


In [None]:
# Count of null entries per column.
print("Missing values in each column:", df.isnull().sum(), sep="\n")

Missing values in each column:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [None]:
#CONTENT BASED FILTERING

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Replace missing text with '' so the string concatenation below never yields NaN.
for _col in ('description', 'cast', 'listed_in'):
    df[_col] = df[_col].fillna('')

# One free-text field per title: description, cast and genres separated by spaces.
df['content'] = df['description'].str.cat([df['cast'], df['listed_in']], sep=' ')

# TF-IDF vectors (English stop words removed) and their pairwise dot products.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['content'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Title -> row number lookup. Deduplicate on the *title* (the Series index):
# calling .drop_duplicates() here compares the row numbers, which are always
# unique, so repeated titles would survive and `indices[title]` would return a
# Series instead of a single row number.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``df['title']``.
    cosine_sim : array-like, optional
        Square item-item similarity matrix, indexed by row number. The default
        is whatever ``cosine_sim`` held when this cell was executed.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in ``indices``.
    """
    import numpy as np  # local import: numpy is not imported until a later cell

    idx = indices[title]
    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores keeps ties in row order, like
    # sorted(..., reverse=True) does.
    ranked = (-scores).argsort(kind='stable')
    # Drop the query title by row number instead of assuming it ranks first.
    ranked = ranked[ranked != idx][:top_n]
    return df['title'].iloc[ranked]


print(get_recommendations('Avengers: Infinity War'))

8580                       Thor: Ragnarok
7405     Mark Gatiss: A Study in Sherlock
6325                        Black Panther
6958                                  Her
8696                            War Horse
8395                    The Little Prince
1406    Penguins of Madagascar: The Movie
969                  August: Osage County
7286                               Legion
3667             3Below: Tales of Arcadia
Name: title, dtype: object


In [None]:
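# COLLABORATIVE FILTERING (approximated: hand-written user genre profiles are matched against each title's genres and director; no user-item interaction data is used)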

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# Hand-written genre preferences per user; used as the query text below.
user_profiles = {
    'User1': ['Action', 'Adventure'],
    'User2': ['Drama', 'Romantic'],
    'User3': ['Comedy', 'Family'],
}

# Bag-of-words over each title's genres and director.
df['genres_directors'] = df['listed_in'] + ' ' + df['director'].fillna('')
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['genres_directors'])
# No item-item matrix is built here: the function below never reads one, and
# assigning it to `cosine_sim` would overwrite the TF-IDF matrix that
# get_recommendations binds as its default when its cell is re-run.


def get_user_recommendations(user_id, user_profiles, cosine_sim=None, top_n=10):
    """Return the titles whose genre/director text best matches a user's profile.

    Parameters
    ----------
    user_id : hashable
        Key into `user_profiles`.
    user_profiles : dict
        Maps a user id to a list of keyword strings.
    cosine_sim : optional
        Unused; accepted only so existing calls that pass it keep working.
    top_n : int, optional
        Number of titles to return (default 10).

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, best match first.

    Raises
    ------
    KeyError
        If `user_id` is not in `user_profiles`.
    """
    user_query = ' '.join(user_profiles[user_id])
    # Project the query into the vocabulary fitted on the catalogue.
    user_vec = count.transform([user_query])
    sim_scores = cosine_similarity(user_vec, count_matrix).flatten()
    movie_indices = sim_scores.argsort()[::-1][:top_n]
    return df['title'].iloc[movie_indices]


print(get_user_recommendations('User1', user_profiles))

4028    Into the Badlands
4024      Triple Frontier
1608       3 Days to Kill
1620           Peppermint
1135               Takers
1124         Maximum Risk
7198    Kill Bill: Vol. 2
8694                  War
437            Cosmic Sin
6427        Casino Royale
Name: title, dtype: object


In [None]:
#HYBRID FILTERING

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Rebuild the combined text field and its TF-IDF similarity for the hybrid model.
for _col in ('description', 'cast', 'listed_in'):
    df[_col] = df[_col].fillna('')
df['content'] = df['description'].str.cat([df['cast'], df['listed_in']], sep=' ')

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['content'])
# Stored under its own name so it can be blended with the genre/director matrix.
cosine_sim_content = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Genres plus director (missing director -> '') as one space-separated string.
df['genres_directors'] = df['listed_in'].str.cat(df['director'].fillna(''), sep=' ')

# Token counts over that text, then pairwise cosine similarity between titles.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['genres_directors'])
cosine_sim_collab = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Title -> row number lookup, deduplicated on the title itself so that
# `indices[title]` is always a single row number (.drop_duplicates() on the
# values would be a no-op because row numbers are unique).
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def hybrid_recommendations(title, content_sim=cosine_sim_content, collab_sim=cosine_sim_collab, alpha=0.5, top_n=10):
    """Recommend titles by blending two item-item similarity matrices.

    The score of every candidate is
    ``alpha * content_sim[idx] + (1 - alpha) * collab_sim[idx]``.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``df['title']``.
    content_sim, collab_sim : array-like, optional
        Square similarity matrices indexed by row number.
    alpha : float, optional
        Weight of `content_sim`; `collab_sim` gets ``1 - alpha`` (default 0.5).
    top_n : int, optional
        Number of titles to return (default 10).

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, highest blended score first.

    Raises
    ------
    KeyError
        If `title` is not present in ``indices``.
    """
    import numpy as np  # local import: numpy is not imported until a later cell

    idx = indices[title]
    # Vectorised blend of the two similarity rows (replaces a per-row Python loop).
    combined = alpha * np.asarray(content_sim[idx]) + (1 - alpha) * np.asarray(collab_sim[idx])
    # Stable sort keeps tied scores in row order, like sorted(..., reverse=True).
    ranked = (-combined).argsort(kind='stable')
    # Exclude the query title by row number rather than assuming it ranks first.
    ranked = ranked[ranked != idx][:top_n]
    return df['title'].iloc[ranked]


print(hybrid_recommendations('Sherlock Holmes'))

7326    Lock, Stock and Two Smoking Barrels
176                        The Golden Child
566                        Charlie's Angels
6447        Charlie's Angels: Full Throttle
930                                Due Date
975        Shadow and Bone - The Afterparty
8032                              Skiptrace
1626                  The Happytime Murders
6343                            Blue Streak
8284                   The Dukes of Hazzard
Name: title, dtype: object


In [None]:
# Blank out missing values in the four text columns.
for _col in ('description', 'cast', 'listed_in', 'director'):
    df[_col] = df[_col].fillna('')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Reload the raw catalogue; this replaces `df`, including any columns added earlier.
df = pd.read_csv('netflix_titles.csv')

# Titles that have a rating label. Both id columns are derived from show_id alone.
df_ratings = (
    df[['show_id', 'rating']]
    .dropna()
    .assign(
        item_id=lambda frame: frame['show_id'].astype('category').cat.codes,
        user_id=lambda frame: frame.groupby('show_id').ngroup(),
    )
)

# Features are the two id columns; the target is the rating label.
X = df_ratings[['user_id', 'item_id']]
y = df_ratings['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Numeric codes 1..5 for five rating labels. Any label missing from this
# mapping becomes NaN in `rating_numeric` and is removed by dropna() below.
rating_mapping = dict(zip(['TV-MA', 'TV-14', 'TV-PG', 'TV-G', 'NR'], range(1, 6)))
df['rating_numeric'] = df['rating'].map(rating_mapping)

# Keep only titles with a mapped rating; both ids are derived from show_id.
df_ratings = (
    df[['show_id', 'rating_numeric']]
    .dropna()
    .assign(
        item_id=lambda frame: frame['show_id'].astype('category').cat.codes,
        user_id=lambda frame: frame.groupby('show_id').ngroup(),
    )
)
X = df_ratings[['user_id', 'item_id']]
y = df_ratings['rating_numeric']
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
# Fixed seed, matching the other models in this notebook: scikit-learn permutes
# features at every split, so an unseeded tree can give different metrics on
# each run.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the held-out split; RMSE is the square root of MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
# A negative R-squared means the model is worse than always predicting the mean.
r2 = r2_score(y_test, y_pred)
print(f"Decision Tree Regressor MSE: {mse}")
print(f"Decision Tree Regressor MAE: {mae}")
print(f"Decision Tree Regressor RMSE: {rmse}")
print(f"Decision Tree Regressor R-squared: {r2}")


Decision Tree Regressor MSE: 1.4280245022970903
Decision Tree Regressor MAE: 0.821592649310873
Decision Tree Regressor RMSE: 1.1949997917560866
Decision Tree Regressor R-squared: -0.7464788126447963


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Random forest (100 trees, seeded) fitted on the training split; test MSE reported.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_y_pred)
print(f"Random Forest Regressor MSE: {rf_mse}")

Random Forest Regressor MSE: 1.0698126339969372


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
# 5-nearest-neighbours regressor on the same split; test MSE reported.
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)
knn_mse = mean_squared_error(y_true=y_test, y_pred=knn_y_pred)
print(f"K-Nearest Neighbors Regressor MSE: {knn_mse}")

K-Nearest Neighbors Regressor MSE: 0.9071362940275651


In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
# Single hidden layer (10 units), seeded, at most 1000 iterations; test MSE reported.
mlp_model = MLPRegressor(hidden_layer_sizes=(10,), max_iter=1000, random_state=42).fit(X_train, y_train)
mlp_y_pred = mlp_model.predict(X_test)
mlp_mse = mean_squared_error(y_true=y_test, y_pred=mlp_y_pred)
print(f"Neural Network Regressor MSE: {mlp_mse}")

Neural Network Regressor MSE: 1.1177458256175132


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
# Gradient boosting (100 stages, seeded) on the same split; test MSE reported.
gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
gbr_y_pred = gbr_model.predict(X_test)
gbr_mse = mean_squared_error(y_true=y_test, y_pred=gbr_y_pred)
print(f"Gradient Boosting Regressor MSE: {gbr_mse}")

Gradient Boosting Regressor MSE: 0.7868381955375192


In [None]:
import pandas as pd
from datetime import datetime
import numpy as np
# Toy interaction log: who rated what, when, where and on which device.
# NOTE: this rebinds `df`, replacing the Netflix catalogue frame.
data = {
    'user_id': [1, 2, 1, 3],
    'item_id': [101, 102, 103, 101],
    'rating': [5, 4, 3, 5],
    'timestamp': [datetime.now() for _ in range(4)],
    'location': ['New York', 'Los Angeles', 'New York', 'Chicago'],
    'device': ['mobile', 'desktop', 'mobile', 'tablet'],
}
df = pd.DataFrame(data)


def recommend_items(user_id, current_time, location, device):
    """Rank the items a user rated in a given context by mean rating.

    Rows of the module-level ``df`` are kept when they belong to `user_id`,
    were recorded at or before `current_time`, and match both `location` and
    `device`. The surviving item ids are returned best-rated first; the list
    is empty when no row matches.
    """
    same_user = df['user_id'] == user_id
    not_in_future = df['timestamp'] <= current_time
    same_location = df['location'] == location
    same_device = df['device'] == device
    user_data = df[same_user & not_in_future & same_location & same_device]

    mean_rating = user_data.groupby('item_id')['rating'].mean()
    return mean_rating.sort_values(ascending=False).index.tolist()


user_id = 1
current_time = datetime.now()
location = 'New York'
device = 'mobile'
print(f"Recommended items: {recommend_items(user_id, current_time, location, device)}")


Recommended items: [101, 103]


In [None]:
#SECURITY AND PRIVACY

In [None]:
from cryptography.fernet import Fernet

# Fresh symmetric key and the Fernet cipher built from it.
key = Fernet.generate_key()
cipher_suite = Fernet(key)


def _encrypt_value(x):
    """Encrypt the UTF-8 bytes of str(x) with the notebook's cipher."""
    return cipher_suite.encrypt(str(x).encode('utf-8'))


# Add an encrypted copy of each identifying column; the plaintext columns are kept.
for _target, _source in (
    ('encrypted_user_id', 'user_id'),
    ('encrypted_location', 'location'),
    ('encrypted_device', 'device'),
):
    df[_target] = df[_source].apply(_encrypt_value)

In [None]:
import hashlib

# Pseudonymise user_id with the hex SHA-256 digest of its string form.
# NOTE(review): the hash is unsalted and the ids are small integers, so the
# mapping can be rebuilt by hashing candidate ids; consider a keyed HMAC if
# this has to resist re-identification.
df['hashed_user_id'] = df['user_id'].map(
    lambda uid: hashlib.sha256(str(uid).encode('utf-8')).hexdigest()
)

In [None]:
# Remove the plaintext identifying columns. errors='ignore' keeps the cell safe
# to re-run: without it a second execution raises KeyError because the columns
# are already gone.
df = df.drop(columns=['user_id', 'location', 'device'], errors='ignore')

In [None]:
# Preview of the frame: the identifying values now appear only as ciphertext and hashes.
print(df.head(n=5))

   item_id  rating                  timestamp  \
0      101       5 2024-08-23 15:30:50.081306   
1      102       4 2024-08-23 15:30:50.081309   
2      103       3 2024-08-23 15:30:50.081310   
3      101       5 2024-08-23 15:30:50.081310   

                                   encrypted_user_id  \
0  b'gAAAAABmyKsqRN-mzlXEhhcVzPtJVKfgBLgtJ8e16cS6...   
1  b'gAAAAABmyKsqktYUz-M2KBEhbhv4t27wvcj4_eDcte_A...   
2  b'gAAAAABmyKsqWLZzhFL4mXqPx0WJMLXZXmZwXJDrLWp8...   
3  b'gAAAAABmyKsqsvtyorpECUviNhOkA0OcOiu6tPfZ8mZ5...   

                                  encrypted_location  \
0  b'gAAAAABmyKsqJuhM_t3YK44ZnqLvZAnOYk9E7Der-zoI...   
1  b'gAAAAABmyKsq5z9X9LB4rksgIPJWPryhG9bjqPApepnQ...   
2  b'gAAAAABmyKsqz58fnP2tXPbcStoV2Wz_OIlzqzjVk69q...   
3  b'gAAAAABmyKsqNKx-4-UHFVTgYz44wJbvPFNHrPrnjSdW...   

                                    encrypted_device  \
0  b'gAAAAABmyKsqrsNABGgf7ZsY9z4X66rY8okE96JlTw7R...   
1  b'gAAAAABmyKsqIq-qje_xKZ8q7u4j4GTclGmYZazb9Tfq...   
2  b'gAAAAABmyKsqiGhcYIK

In [None]:
# Example roles
roles = {
    'admin': {'access_level': 'full'},
    'user': {'access_level': 'read_only'}
}


# Function to check access level
def check_access(role, operation):
    """Print and return whether `role` may perform `operation`.

    A role with access level 'full' may do anything; a 'read_only' role may
    only 'read'. A role that is not in ``roles`` is denied rather than raising
    KeyError, so an unrecognised role can never slip through a careless caller.

    Returns
    -------
    bool
        True when the operation is permitted, False otherwise. Returning the
        decision lets callers branch on it instead of repeating the rule.
    """
    access_level = roles.get(role, {}).get('access_level')
    allowed = access_level == 'full' or (access_level == 'read_only' and operation == 'read')
    if allowed:
        print(f"{role} has permission to perform {operation}.")
    else:
        print(f"{role} does NOT have permission to perform {operation}.")
    return allowed


# Example usage
check_access('admin', 'read')
check_access('admin', 'write')
check_access('user', 'read')
check_access('user', 'write')  # This should be denied


admin has permission to perform read.
admin has permission to perform write.
user has permission to perform read.
user does NOT have permission to perform write.


In [None]:
# Gate an operation on the caller's role.
role = 'user'  # Example role, this could come from a user session
operation = 'read'  # Could be 'read', 'write', etc.

# Report the decision.
check_access(role, operation)

# Same rule as check_access, written with the denial branch first.
_level = roles[role]['access_level']
if _level != 'full' and not (_level == 'read_only' and operation == 'read'):
    print("Operation denied due to insufficient permissions.")
else:
    print("Operation allowed.")
    # Continue with secure operations here

user has permission to perform read.
Operation allowed.


In [None]:
#REVIEW-1 FINISHED

In [None]:
pip install flask_login

Collecting flask_login
  Downloading Flask_Login-0.6.3-py3-none-any.whl.metadata (5.8 kB)
Downloading Flask_Login-0.6.3-py3-none-any.whl (17 kB)
Installing collected packages: flask_login
Successfully installed flask_login-0.6.3


In [None]:
from flask import Flask, redirect, url_for, request
from flask_login import LoginManager, UserMixin, login_required, login_user, logout_user

import os
import secrets

app = Flask(__name__)
# The secret key signs the session cookie, so it must not be a literal in the
# notebook. Read it from the environment; fall back to a random per-process
# key (sessions then stop being valid when the kernel restarts).
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)
login_manager = LoginManager()
login_manager.init_app(app)

class User(UserMixin):
    """Minimal Flask-Login user: only carries the identifier stored in the session."""

    def __init__(self, id):
        self.id = id

# Example user store. The key must equal the user's id as a string: Flask-Login
# saves str(user.id) in the session and hands that string to the user loader.
# With User(1) under the key 'user1' the loader was asked for '1', found
# nothing, and every request after login was treated as anonymous.
users = {'user1': User('user1')}

@login_manager.user_loader
def load_user(user_id):
    """Return the User for the id stored in the session, or None if unknown."""
    return users.get(user_id)

@app.route('/login', methods=['GET', 'POST'])
def login():
    """Log in the user named by the posted 'user_id' form field.

    A GET request, or a POST naming an unknown user, gets the plain
    'Login Page' text. A POST naming a known user starts a session and
    redirects to the dashboard.

    NOTE(review): no password or other credential is checked; knowing a
    user id is enough to log in.
    """
    if request.method != 'POST':
        return 'Login Page'
    user = users.get(request.form['user_id'])
    if not user:
        return 'Login Page'
    login_user(user)
    return redirect(url_for('dashboard'))

@app.route('/dashboard')
@login_required
def dashboard():
    """Return the dashboard text; login_required guards the route."""
    page_text = 'This is the dashboard. You are logged in.'
    return page_text

@app.route('/logout')
@login_required
def logout():
    """End the current session and send the browser back to the login route."""
    logout_user()
    login_url = url_for('login')
    return redirect(login_url)

# Start Flask's built-in server when this code runs as the main module.
# NOTE(review): the captured output shows the server waiting for CTRL+C, so
# this call appears to block the cell until interrupted — confirm that is intended.
if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:
import logging

# Set up a dedicated audit logger that writes to access.log.
# logging.basicConfig() does nothing when the root logger already has a
# handler (common inside notebook environments), in which case no file would
# ever be written. Attaching a FileHandler to a named logger works regardless
# of how the root logger is configured.
access_logger = logging.getLogger('access')
access_logger.setLevel(logging.INFO)
access_logger.propagate = False  # keep audit records out of other handlers
if not access_logger.handlers:  # re-running the cell must not stack handlers
    _access_handler = logging.FileHandler('access.log')
    _access_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    access_logger.addHandler(_access_handler)


def access_sensitive_data(user_id):
    """Record in access.log that `user_id` accessed sensitive data."""
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    access_logger.info('User %s accessed sensitive data', user_id)


# Example usage
access_sensitive_data('user1')


In [None]:
import base64

# Encrypt the entire DataFrame before storage
def _encrypt_cell(value):
    """Encrypt one cell and return the Fernet token as ASCII text.

    Values that are already bytes are encrypted as-is; everything else is
    encrypted via str(value). Decoding the token to text keeps the CSV free of
    Python ``b'...'`` wrappers, which would otherwise have to be stripped
    before decryption.
    """
    raw = value if isinstance(value, bytes) else str(value).encode('utf-8')
    return cipher_suite.encrypt(raw).decode('ascii')


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# fall back to applymap on older pandas.
_elementwise = df.map if hasattr(type(df), 'map') else df.applymap
encrypted_data = _elementwise(_encrypt_cell)

# Save to a file (e.g., CSV)
encrypted_data.to_csv('encrypted_data.csv', index=False)

# Optionally, encode the encryption key as well
# NOTE(review): the key is written in recoverable form next to the data it
# protects; anyone who can read both files can decrypt. Keep the key in a
# secrets store or an environment variable instead.
encoded_key = base64.urlsafe_b64encode(key).decode('utf-8')
with open('encryption_key.txt', 'w') as f:
    f.write(encoded_key)

  encrypted_data = df.applymap(lambda x: cipher_suite.encrypt(str(x).encode('utf-8')))


In [None]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("Netflix Recommendation System") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Load dataset from the working directory, like the pandas cells above, rather
# than from an absolute path that exists on one machine only.
# NOTE: `df` is rebound here to a Spark DataFrame.
file_path = "netflix_titles.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display schema to ensure correct loading
df.printSchema()

# Drop rows with a missing value in any column. This already removes rows with
# a null rating, so no separate isNotNull filter is needed.
df_filtered = df.dropna()

# Show the first few rows of the processed DataFrame
df_filtered.show()

# Save processed data. Spark writes a directory of part files at this path;
# mode("overwrite") lets the cell be re-run without failing because the
# output path already exists.
df_filtered.write.mode("overwrite").csv("processed_netflix_titles.csv", header=True)
