In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from textblob import Word, TextBlob


In [None]:
# Load the CEAS_08 phishing-email CSV from the Kaggle input directory.
df=pd.read_csv("/kaggle/input/phishing-email-dataset/CEAS_08.csv")

In [None]:
# Label encoding: 1 = phishing, 0 = legitimate.

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# How often each distinct sender value occurs.
df["sender"].value_counts()

In [None]:
# Domain part of the sender address. The capture stops at the next whitespace,
# '>' or '@', so if a sender is stored in "Name <user@host>" form the closing
# '>' is not carried into the domain (a plain split('@') would keep it).
# Rows with no '@' or with a missing sender become NaN.
df["sender_2"] = df["sender"].str.extract(r'@([^\s>@]+)', expand=False)

In [None]:
# How often each extracted sender domain occurs.
df["sender_2"].value_counts()

In [None]:
# Download the VADER lexicon resource through NLTK's downloader.
nltk.download('vader_lexicon')

In [None]:
# Shared analyzer instance; get_compound_score reads this module-level name.
sia=SentimentIntensityAnalyzer()

In [None]:
# Quick sanity check: display the compound score for one hand-written sentence.
sia.polarity_scores("It is terrible")["compound"]

In [None]:
def get_compound_score(text):
    """Return the ``compound`` sentiment score for ``text``.

    The score is looked up in the dictionary produced by the module-level
    ``sia`` analyzer's ``polarity_scores`` call.
    """
    scores = sia.polarity_scores(text)
    return scores['compound']

In [None]:
# get_compound_score is defined once, in the preceding cell; the verbatim
# second definition that used to sit here has been dropped so the helper has a
# single source of truth.

# Make both text columns plain strings. Missing values are mapped to '' first:
# astype(str) on its own turns NaN into the literal text "nan", which would
# then be scored, tokenised and measured as if it were real email content.
df['subject'] = df['subject'].fillna('').astype(str)
df['body'] = df['body'].fillna('').astype(str)


In [None]:
# Sentiment (compound score) of each subject line.
df['Compound_Subject'] = df['subject'].apply(get_compound_score)


In [None]:
# Sentiment (compound score) of each message body.
df['Compound_Body'] = df['body'].apply(get_compound_score)

In [None]:
# Combined score: subject score plus body score, both cast to float before adding.
df["Compound"]=df['Compound_Subject'].astype(float)+df['Compound_Body'].astype(float)

In [None]:
# Four unit-wide intervals spanning -2 .. 2, with one label per interval.
bins = [-2, -1, 0, 1, 2]
labels = [
    '-2 to -1',
    '-1 to 0',
    '0 to 1',
    '1 to 2',
]

# Create the new labelled column. include_lowest=True makes the first interval
# closed on the left, so a combined score of exactly -2 still gets a label.
df['Compound_Label'] = pd.cut(
    df['Compound'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [None]:
# Preview the frame with the sentiment columns added.
df.head()

In [None]:
# Lower-case the subject column (overwrites it in place).
df["subject"]=df["subject"].str.lower()

In [None]:
# Lower-case the body column (overwrites it in place).
df["body"]=df["body"].str.lower()

In [None]:
# Single text field per message: subject and body joined by one space.
df["Subject+Body"]=df["subject"]+" "+df["body"]

In [None]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave all punctuation in place.
df["Subject+Body"] = df["Subject+Body"].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Load the spam-word list from the Kaggle input directory.
spam_words=pd.read_excel("/kaggle/input/spam-words/Spam_Words.xlsx")

In [None]:
# Lower-case the spam entries so they can be compared against lower-cased text.
spam_words["SPAM WORDS"]=spam_words["SPAM WORDS"].str.lower()

In [None]:
# Apply the same punctuation stripping used on the email text. regex=True is
# passed explicitly because pandas >= 2.0 would otherwise treat the pattern as
# a literal string and strip nothing.
spam_words["SPAM WORDS"] = spam_words["SPAM WORDS"].str.replace(r'[^\w\s]', '', regex=True)

# Entries that were only punctuation are now empty (or whitespace-only), and an
# empty string is a substring of every text, so it would count as a hit for
# every message. Drop those rows together with missing entries.
spam_words = spam_words[
    spam_words["SPAM WORDS"].fillna('').str.strip() != ''
].reset_index(drop=True)

In [None]:
def count_spam_words(text, spam_words):
    """Count how many spam-list entries occur in ``text``.

    ``text`` is lower-cased and every entry is tested with a plain substring
    check, so each entry contributes at most 1 regardless of how many times it
    appears. Entries are compared as given (they are not lower-cased here).

    Parameters
    ----------
    text : str
        Message text to scan. Any non-string value (e.g. ``None`` or NaN)
        yields a count of 0.
    spam_words : mapping or DataFrame
        Object whose ``"SPAM WORDS"`` item is an iterable of entries.

    Returns
    -------
    int
        Number of usable entries found in ``text``.
    """
    if not isinstance(text, str):
        return 0

    haystack = text.lower()
    return sum(
        1
        for entry in spam_words["SPAM WORDS"]
        # Skip missing values (would raise on `in`) and blank entries
        # (an empty string is contained in every text).
        if isinstance(entry, str) and entry.strip() and entry in haystack
    )

In [None]:
# Number of spam-list entries found in each message's combined text.
df["Spam_Count"] = df["Subject+Body"].apply(count_spam_words, args=(spam_words,))

In [None]:
# Character count of each message's combined text.
df["Text_Length"] = df["Subject+Body"].map(len)

In [None]:
# One 0/1 indicator column per token: 1 when the token appears anywhere in the
# raw sender string (plain substring match, not a regular expression).
# na=False makes a missing sender count as "no match" instead of raising.
for token in ("edu", "python", "apache", "loewis", "gmail", "org"):
    df[f"{token}_mail"] = (
        df["sender"].str.contains(token, regex=False, na=False).astype(int)
    )


In [None]:
# Preview the frame with the engineered feature columns added.
df.head()

In [None]:
# Numeric feature columns together with the target column.
df_new = df[
    [
        "urls",
        "Compound_Subject",
        "Spam_Count",
        "Text_Length",
        "edu_mail",
        "python_mail",
        "apache_mail",
        "loewis_mail",
        "gmail_mail",
        "org_mail",
        "label",
    ]
]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the selected features and the label.
correlation_matrix = df_new.corr()

# Draw on an explicit Axes instead of going through the implicit pyplot state.
# The title text is Turkish for "Correlation Matrix Heat Map".
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Korelasyon Matrisi Isı Haritası')
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings.
# NOTE(review): TFIDV is not referenced by any other cell visible here — confirm
# whether it is still needed.
TFIDV=TfidfVectorizer()



In [None]:
from scipy.sparse import hstack, csr_matrix

In [None]:
# TF-IDF representation of the combined text, fitted on every row of df.
# Vocabulary is capped at 10,000 terms; terms appearing in more than 95% of the
# messages or in fewer than 2 messages are discarded.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    max_df=0.95,
    min_df=2,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Subject+Body'])

# Dense, column-labelled copy of the TF-IDF matrix (toarray() materialises
# every cell, so this is far larger in memory than the sparse original).
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Hand-crafted columns converted to sparse form so they can be stacked
# column-wise next to the TF-IDF matrix.
sparse_features = csr_matrix(
    df[
        [
            "urls",
            "Compound_Subject",
            "Spam_Count",
            "Text_Length",
            "edu_mail",
            "python_mail",
            "apache_mail",
            "loewis_mail",
            "gmail_mail",
            "org_mail",
        ]
    ].to_numpy()
)

# Combined feature matrix: hand-crafted columns first, then the TF-IDF columns.
X = hstack([sparse_features, tfidf_matrix])

In [None]:
# Target vector: the label column.
y=df["label"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 30% of the rows of the combined matrix for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Logistic regression on the combined (hand-crafted + TF-IDF) feature matrix.
# The hand-crafted columns include raw text lengths, which sit on a very
# different scale from TF-IDF weights; unscaled, that can stop the default
# solver from converging within its default 100 iterations. MaxAbsScaler
# divides each column by its largest absolute value (keeping the matrix
# sparse) and, inside the pipeline, is fitted on the training split only.
# max_iter is raised as additional headroom.
model = make_pipeline(MaxAbsScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_test)

# Share of held-out rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

In [None]:
# Second feature set: the dense TF-IDF frame on its own (an alias, not a copy).
X_2=tfidf_df


In [None]:
# Same 70/30 split settings applied to X_2. Note that y_train / y_test are rebound here;
# with the same y, test_size and random_state they should select the same rows as before.
X_2_train, X_2_test, y_train, y_test = train_test_split(X_2, y, test_size=0.3, random_state=42)


In [None]:
# Default logistic regression trained on the TF-IDF-only feature set.
model = LogisticRegression().fit(X_2_train, y_train)
y_pred = model.predict(X_2_test)

# Share of held-out rows classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Model Accuracy:", accuracy)

In [None]:
# Third feature set: only the hand-crafted columns, without any TF-IDF terms.
X_3=df[["urls", "Compound_Subject", "Spam_Count", "Text_Length", "edu_mail", "python_mail", "apache_mail", "loewis_mail", "gmail_mail", "org_mail"]]

In [None]:
# Same 70/30 split settings applied to X_3; y_train / y_test are rebound again.
X_3_train, X_3_test, y_train, y_test = train_test_split(X_3, y, test_size=0.3, random_state=42)

In [None]:
# Best option among the feature sets tried in this notebook: X_2 (the TF-IDF matrix on its own).

In [None]:
# Logistic regression on the hand-crafted features only.
# These columns mix 0/1 flags with raw text lengths, so their scales differ by
# orders of magnitude; unscaled, that can stop the default solver from
# converging within its default 100 iterations. MaxAbsScaler divides each
# column by its largest absolute value and, inside the pipeline, is fitted on
# the training split only. max_iter is raised as additional headroom.
model = make_pipeline(MaxAbsScaler(), LogisticRegression(max_iter=1000))
model.fit(X_3_train, y_train)

# Predict on the test data.
y_pred = model.predict(X_3_test)

# Compute the accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest on the TF-IDF-only feature set.
# A forest is built from randomised bootstrap samples and feature subsets, so
# without a fixed seed the reported accuracy changes from run to run.
# random_state=42 matches the seed already used for the train/test splits.
model = RandomForestClassifier(random_state=42)
model.fit(X_2_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_2_test)

# Share of held-out rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)