In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from textblob import TextBlob
import csv

# Try reading the CSV with different options
try:
    df = pd.read_csv('blogs.csv', error_bad_lines=False, warn_bad_lines=True)
except pd.errors.ParserError:
    # Read the CSV and skip rows with errors
    cleaned_data = []
    with open('blogs.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            try:
                if len(row) == 2:  # Assuming there should be two columns
                    cleaned_data.append(row)
            except:
                continue

    # Convert to DataFrame
    df = pd.DataFrame(cleaned_data, columns=['Data', 'Labels'])

# Data exploration
df.info()
df.isnull().sum()

# Preprocess the data
nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['Data'] = df['Data'].apply(preprocess_text)

# Feature extraction
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['Data'])
y = df['Labels']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

# Sentiment analysis
def get_sentiment(text):
    blob = TextBlob(text)
    if blob.sentiment.polarity > 0:
        return 'positive'
    elif blob.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment'] = df['Data'].apply(get_sentiment)

# Sentiment distribution
sentiment_distribution = df.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack()
print(sentiment_distribution)




  df = pd.read_csv('blogs.csv', error_bad_lines=False, warn_bad_lines=True)


  df = pd.read_csv('blogs.csv', error_bad_lines=False, warn_bad_lines=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990 entries, 0 to 1989
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    1990 non-null   object
 1   Labels  1990 non-null   object
dtypes: object(2)
memory usage: 31.2+ KB


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Accuracy: 0.7814070351758794
Precision: 0.8439429230459362
Recall: 0.7814070351758794
F1-score: 0.7801675008861262
Sentiment                 negative  neutral  positive
Labels                                               
Labels                         NaN     1.00       NaN
alt.atheism               0.350000      NaN  0.650000
comp.graphics             0.270000      NaN  0.730000
comp.os.ms-windows.misc   0.240000      NaN  0.760000
comp.sys.ibm.pc.hardware  0.190000      NaN  0.810000
comp.sys.mac.hardware     0.260000      NaN  0.740000
comp.windows.x            0.200000     0.02  0.780000
misc.forsale              0.210000      NaN  0.790000
rec.autos                 0.240000      NaN  0.760000
rec.motorcycles           0.280000      NaN  0.720000
rec.sport.baseball        0.350000      NaN  0.650000
rec.sport.hockey          0.400000      NaN  0.600000
sci.crypt                 0.190000      NaN  0.810000
sci.electronics           0.250000      NaN  0.750000
sci.med              