1. Data Ingestion Pipeline:

a. Design a data ingestion pipeline that collects and stores data from various sources such as databases, APIs, and streaming platforms.

b. Implement a real-time data ingestion pipeline for processing sensor data from IoT devices.

c. Develop a data ingestion pipeline that handles data from different file formats (CSV, JSON, etc.) and performs data validation and cleansing.


In [None]:
import requests
import json
import csv

# Collect data from a RESTful API
api_url = "https://api.example.com/data"
# A timeout keeps ingestion from hanging forever on an unresponsive endpoint.
response = requests.get(api_url, timeout=30)
# Fail fast on 4xx/5xx so an error payload is never parsed and stored as data.
response.raise_for_status()
data = response.json()

# Store data in a database
# Assuming you have a database connection already established
# NOTE(review): `db` is not defined in this cell; it must exist before this line runs.
db.insert(data)

# Collect data from a database (SQL)
# This bounded read runs BEFORE the Kafka loop below: iterating a KafkaConsumer
# blocks indefinitely by default, so anything placed after it is never reached.
import sqlite3

conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM sensor_data")
    data = cursor.fetchall()
finally:
    # Always release the database handle, even if the query fails.
    conn.close()

# Collect data from a streaming platform (e.g., Apache Kafka)
from kafka import KafkaConsumer

consumer = KafkaConsumer('sensor_data_topic', bootstrap_servers='localhost:9092')
for message in consumer:
    # Each message value is decoded as a JSON document.
    data = json.loads(message.value)
    # Store data in a database or perform real-time processing


In [None]:
from kafka import KafkaConsumer

# Subscribe to the sensor topic on the local broker and JSON-decode every record.
# NOTE(review): `json` is not imported in this cell; it relies on an earlier cell's import.
consumer = KafkaConsumer(
    'sensor_data_topic',
    bootstrap_servers='localhost:9092',
)
for record in consumer:
    data = json.loads(record.value)

In [None]:
import csv
import json

# Handle CSV file
# Rows that survive cleansing are collected here for further processing.
csv_rows = []
# newline='' lets the csv module do its own newline handling (quoted fields
# may contain embedded line breaks).
with open('data.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        # Perform data validation and cleansing
        cleaned_row = [field.strip() for field in row]
        if not any(cleaned_row):
            # Skip blank lines and rows made up only of empty fields.
            continue
        # Store or process the data further
        csv_rows.append(cleaned_row)

# Handle JSON file
with open('data.json', 'r') as json_file:
    data = json.load(json_file)
    # Perform data validation and cleansing
    # Store or process the data further


2. Model Training:

a. Build a machine learning model to predict customer churn based on a given dataset. Train the model using appropriate algorithms and evaluate its performance.

b. Develop a model training pipeline that incorporates feature engineering techniques such as one-hot encoding, feature scaling, and dimensionality reduction.

c. Train a deep learning model for image classification using transfer learning and fine-tuning techniques.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the churn dataset from the relative path 'churn_data.csv'
data = pd.read_csv('churn_data.csv')

# The 'churn' column is the target; every remaining column is a feature
y = data['churn']
X = data.drop(columns='churn')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics on the training split only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression classifier on the scaled training features
model = LogisticRegression().fit(X_train_scaled, y_train)

# Score the held-out predictions with four classification metrics
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# Imported here so the cell does not depend on an earlier cell having been run.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset with features (X) and target variable (y)
# Assuming X contains both categorical and numerical features

# Define preprocessing steps
categorical_features = ['category1', 'category2']
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising when it shows up in the test split.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

numerical_features = ['numerical1', 'numerical2']
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ],
    # sparse_threshold=0 forces a dense result: the one-hot block is sparse by
    # default and the PCA step that follows needs a dense array.
    sparse_threshold=0)

# Add dimensionality reduction step (PCA)
# NOTE(review): n_components must not exceed the number of columns produced by
# the preprocessor (or the number of training rows); confirm 10 fits the data.
pca = PCA(n_components=10)

# Build the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('dim_reduction', pca),
    ('classifier', RandomForestClassifier())
])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline (preprocessing and model training)
pipeline.fit(X_train, y_train)

# Model evaluation
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load the VGG16 convolutional base with ImageNet weights, without its classifier head
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the base model's layers
for layer in base_model.layers:
    layer.trainable = False

# Create the model architecture
model = Sequential()
model.add(base_model)
# Collapse the spatial feature map into one vector per image. Without this the
# Dense layers act on every spatial position and the model emits a grid of
# predictions instead of a single probability per image.
model.add(GlobalAveragePooling2D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
# `learning_rate` replaces the deprecated `lr` argument.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Prepare image data generators (assuming you have train and validation directories)
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    'train_dir',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary')

val_datagen = ImageDataGenerator(rescale=1./255)
val_generator = val_datagen.flow_from_directory(
    'val_dir',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary')

# Train the model with transfer learning
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=10,
    validation_data=val_generator,
    validation_steps=len(val_generator))

# Fine-tuning (optional)
# The slice is applied to the layers of the VGG16 base, not to the outer
# Sequential model: the outer model has only a handful of layers, so slicing it
# at 15 would freeze everything, including the new classification head.
for layer in base_model.layers[:15]:
    layer.trainable = False
for layer in base_model.layers[15:]:
    layer.trainable = True

# Compile the model after fine-tuning
# Recompiling makes the trainable changes take effect; the lower learning rate
# limits how far the pre-trained weights are moved.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Continue training with fine-tuning
history_finetune = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=5,
    validation_data=val_generator,
    validation_steps=len(val_generator))


3. Model Validation:

a. Implement cross-validation to evaluate the performance of a regression model for predicting housing prices.

b. Perform model validation using different evaluation metrics such as accuracy, precision, recall, and F1 score for a binary classification problem.

c. Design a model validation strategy that incorporates stratified sampling to handle imbalanced datasets.


In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# X (features) and y (target) are expected to exist already; this cell does not load them

# Plain linear regression is the estimator under evaluation
model = LinearRegression()

# 5-fold cross-validation; the scorer returns negated MSE, one value per fold
cv_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)

# Flip the sign back to MSE and take the square root to get RMSE per fold
rmse_scores = np.sqrt(-cv_scores)

# Report each fold, numbering from 1
for fold_number, fold_rmse in enumerate(rmse_scores, start=1):
    print(f"Fold {fold_number}: RMSE = {fold_rmse}")

# Summarise the spread of the per-fold RMSE values
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print("Mean RMSE:", mean_rmse)
print("Standard Deviation of RMSE:", std_rmse)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# X (features) and y (binary target) are expected to exist already; this cell does not load them

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression classifier on the training split
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Compare predictions against the true labels with four metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Load dataset and split into features (X) and target variable (y)
# Assuming you have X and y for binary classification with imbalanced classes

# Create a classification model (assuming you want to use Support Vector Machine)
model = SVC()

# Create a stratified k-fold cross-validator
skf = StratifiedKFold(n_splits=5)

# The splitter yields positional row indices. Converting to NumPy arrays makes
# `values[indices]` select rows whether X/y arrive as arrays, lists or pandas
# objects (indexing a DataFrame with an index array selects columns instead).
X_values = np.asarray(X)
y_values = np.asarray(y)

# Perform model validation with stratified sampling
accuracy_scores = []
for train_index, test_index in skf.split(X_values, y_values):
    X_train, X_test = X_values[train_index], X_values[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Print the accuracy scores for each fold
for fold, score in enumerate(accuracy_scores):
    print(f"Fold {fold+1}: Accuracy = {score}")

# Calculate the mean and standard deviation of the accuracy scores
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)


4. Deployment Strategy:

   a. Create a deployment strategy for a machine learning model that provides real-time recommendations based on user interactions.
   
   b. Develop a deployment pipeline that automates the process of deploying machine learning models to cloud platforms such as AWS or Azure.
   
   c. Design a monitoring and maintenance strategy for deployed models to ensure their performance and reliability over time.


In [None]:
# Import necessary libraries
from recommendation_model import RecommendationModel  # Assuming you have a recommendation model module
from user_interaction_stream import UserInteractionStream  # Assuming you have a user interaction stream module
from recommendation_service import RecommendationService  # Assuming you have a recommendation service module

# Serve recommendations in a loop: read one user interaction at a time from the
# stream and ask the service for a recommendation based on it.

# Load and initialize the recommendation model
model = RecommendationModel()
# 'model_file.pkl' is a relative path, resolved against the working directory.
# NOTE(review): the .pkl suffix suggests a pickle file; if so, only load files
# from a trusted source — confirm what load_model does.
model.load_model('model_file.pkl')

# Set up a user interaction stream to capture real-time data
stream = UserInteractionStream()

# Initialize the recommendation service
service = RecommendationService(model)

# Start capturing and processing real-time user interactions
# NOTE(review): this loop has no exit condition and no error handling, so any
# exception raised inside it stops the cell.
while True:
    user_interaction = stream.get_next_interaction()
    # `recommendation` is computed but not used yet; see the placeholder below.
    recommendation = service.get_recommendation(user_interaction)
    # Perform actions with the recommendation, such as displaying it to the user or storing it in a database


In [None]:
import os
import subprocess
import sys

import boto3

# Set AWS region
# Credentials are deliberately NOT written into os.environ here: secrets must
# not live in source code, and overwriting the variables would mask credentials
# already supplied through the environment, the shared AWS config files or an
# IAM role, all of which boto3 discovers on its own.
os.environ['AWS_DEFAULT_REGION'] = 'your_aws_default_region'

# Build the model code
# An argument list avoids the shell, sys.executable pins the build to the
# running interpreter, and check=True aborts the deployment if the build fails
# instead of uploading a stale or missing artifact.
subprocess.run([sys.executable, 'setup.py', 'sdist'], check=True)

# Upload the model code to S3
s3_client = boto3.client('s3')
s3_client.upload_file('dist/model_code.tar.gz', 'your_s3_bucket', 'model_code.tar.gz')

# Create an AWS Lambda function
# NOTE(review): Lambda deployment packages are expected to be .zip archives;
# confirm that a .tar.gz sdist is accepted before relying on this step.
lambda_client = boto3.client('lambda')
lambda_client.create_function(
    FunctionName='your_function_name',
    Runtime='python3.8',
    Role='your_lambda_role_arn',
    Handler='handler_function_name',
    Code={
        'S3Bucket': 'your_s3_bucket',
        'S3Key': 'model_code.tar.gz',
    },
    Environment={
        'Variables': {
            'ENV_VAR_NAME': 'env_var_value',
            # Add other environment variables as required
        }
    },
    Timeout=60,
    MemorySize=512,
    Publish=True
)

# Create an API Gateway
api_client = boto3.client('apigateway')
api_client.create_rest_api(
    name='your_api_name',
    description='your_api_description'
    # Add other API Gateway configurations as required
)


In [None]:
import time
import logging

# Set up logging
logging.basicConfig(filename='model_logs.log', level=logging.INFO)

# Monitor the deployed model
# NOTE(review): monitor_performance, check_data_drift and retrain_model are not
# defined in this cell; they must be provided before it runs.
while True:
    try:
        # Monitor performance metrics and log them
        performance_metrics = monitor_performance()
        logging.info(performance_metrics)

        # Check for data drift and trigger retraining if necessary
        if check_data_drift():
            retrain_model()
    except Exception as e:
        # Log any errors or exceptions encountered during monitoring.
        # logging.exception records the traceback along with the message.
        logging.exception("%s", e)

    # Sleep for a specific interval before monitoring again.
    # The pause sits outside the try block so it also runs after a failed
    # cycle; otherwise a persistent error would make the loop spin without
    # delay and flood the log file.
    time.sleep(60)
