In [3]:
import pandas as pd
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


file_path = 'Titanic.csv'  
titanic_data = pd.read_csv('Titanic.csv')


# Handle missing values
imputer = SimpleImputer(strategy='median')
titanic_data['age'] = imputer.fit_transform(titanic_data[['age']])


# Convert categorical variables to numerical
label_encoders = {}
for column in ['sex', 'embarked', 'class', 'who', 'alone']:
    label_encoders[column] = LabelEncoder()
    titanic_data[column] = label_encoders[column].fit_transform(titanic_data[column])


# Split the data into training and testing sets
X = titanic_data.drop('survived', axis=1)
y = titanic_data['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Predict survival for all passengers
titanic_data['predicted_survived'] = rf_model.predict(X)

# Filter passengers that are predicted to survive
survived_passengers = titanic_data[titanic_data['predicted_survived'] == 1]

# Display the passengers predicted to survive
print('Passengers predicted to survive:')
print(tabulate(survived_passengers.head(20), headers='keys', tablefmt='pretty'))

Passengers predicted to survive:
+----+-----+------+-------+-------+----------+----------+-------+-----+-------+----------+--------------------+
|    | sex | age  | sibsp | parch |   fare   | embarked | class | who | alone | survived | predicted_survived |
+----+-----+------+-------+-------+----------+----------+-------+-----+-------+----------+--------------------+
| 1  | 0.0 | 38.0 |  1.0  |  0.0  | 71.2833  |   0.0    |  0.0  | 2.0 |  0.0  |   1.0    |        1.0         |
| 2  | 0.0 | 26.0 |  0.0  |  0.0  |  7.925   |   2.0    |  2.0  | 2.0 |  1.0  |   1.0    |        1.0         |
| 3  | 0.0 | 35.0 |  1.0  |  0.0  |   53.1   |   2.0    |  0.0  | 2.0 |  0.0  |   1.0    |        1.0         |
| 8  | 0.0 | 27.0 |  0.0  |  2.0  | 11.1333  |   2.0    |  2.0  | 2.0 |  0.0  |   1.0    |        1.0         |
| 9  | 0.0 | 14.0 |  1.0  |  0.0  | 30.0708  |   0.0    |  1.0  | 0.0 |  0.0  |   1.0    |        1.0         |
| 10 | 0.0 | 4.0  |  1.0  |  1.0  |   16.7   |   2.0    |  2.0  | 0.0 |

### Number Two

In [10]:
import pandas as pd

# Path to the dataset
dataset_path = 'spam_ham_dataset.csv'

# Read the CSV file into a DataFrame
emails_df = pd.read_csv(dataset_path)

# Check for missing values
print(emails_df.isnull().sum())

# Extract the text of the emails
email_texts = emails_df['text']

# Extract the labels
email_labels = emails_df['label']




from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform the email texts
email_tfidf = tfidf_vect.fit_transform(email_texts)



from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(email_tfidf, email_labels, test_size=0.2, random_state=42)



from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
log_reg_model = LogisticRegression()

# Train the model on the training data
log_reg_model.fit(X_train, y_train)



from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = log_reg_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the classification report
print('Classification Report:')
print(class_report)

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.99      0.99       742
        spam       0.97      0.99      0.98       293

    accuracy                           0.99      1035
   macro avg       0.98      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035

