In [1]:
import pandas as pd

# Load data without assuming the first row is column headers
df = pd.read_csv('all-data.csv', header=None, encoding='latin1')

# Print the first few rows and the columns to inspect the data
print(df.head())
print(df.columns)


          0                                                  1
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...
Index([0, 1], dtype='int64')


In [2]:
# Rename the unnamed column to 'article' for clarity
df.columns = ['label', 'article']



# Check the output
print(df.head())


      label                                            article
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...


In [3]:
print(df.columns)


Index(['label', 'article'], dtype='object')


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load dataset
df = pd.read_csv('all-data.csv', encoding='latin1')

# Rename the unnamed column to 'article' for clarity
df.columns = ['label', 'article']  # 'label' for the first column, 'article' for the second column containing text

# Show the first few rows to verify the column names
print(df.head())

# Initialize NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    # Remove non-alphabetic characters and convert to lower case
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I)
    text = text.lower()
    
    # Tokenization and stopword removal
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(words)

# Apply preprocessing to the 'article' column
df['processed_text'] = df['article'].apply(preprocess_text)

# Verify that preprocessing worked
print(df.head())


      label                                            article
0   neutral  Technopolis plans to develop in stages an area...
1  negative  The international electronic industry company ...
2  positive  With the new production plant the company woul...
3  positive  According to the company 's updated strategy f...
4  positive  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


      label                                            article  \
0   neutral  Technopolis plans to develop in stages an area...   
1  negative  The international electronic industry company ...   
2  positive  With the new production plant the company woul...   
3  positive  According to the company 's updated strategy f...   
4  positive  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...   

                                      processed_text  
0  technopolis plan develop stage area less 0,000...  
1  international electronic industry company elco...  
2  new production plant company would increase ca...  
3  according company updated strategy year 009-20...  
4  financing aspocomp growth aspocomp aggressivel...  


In [5]:
# Check how the labels are mapped
print(df['label'].value_counts())  # Check the distribution of sentiment labels

# Double-check the mapping if you haven't already
y = df['label'].map({'positive': 1, 'negative': 0, 'neutral': 2})

# Split dataset into features and labels
X = df['processed_text']


label
neutral     2878
positive    1363
negative     604
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

# Split dataset into features and labels
X = df['processed_text']
y = df['label']  # Assuming 'sentiment' contains the labels (positive, negative, neutral)

# Convert the labels into numeric values (if not already done)
y = y.map({'positive': 1, 'negative': 0, 'neutral': 2})

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences to ensure they are of the same length
max_sequence_length = 100  # You can adjust this based on your dataset
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


Test Accuracy: 0.7141


In [10]:
def predict_sentiment(news_text):
    # Preprocess the input text
    processed_text = preprocess_text(news_text)
    
    # Convert text to sequence and pad
    seq = tokenizer.texts_to_sequences([processed_text])
    padded_seq = pad_sequences(seq, maxlen=max_sequence_length)
    
    # Predict sentiment
    pred = model.predict(padded_seq)
    
    # Get the label with the highest probability
    sentiment_labels = ['negative', 'positive', 'neutral']
    sentiment = sentiment_labels[pred.argmax()]
    
    return sentiment

# Example usage
new_article = 'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .'
predicted_sentiment = predict_sentiment(new_article)
print(f"Predicted Sentiment: {predicted_sentiment}")


Predicted Sentiment: negative


In [11]:
df['article'][1]

'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .'

In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
import nltk
nltk.data.find('tokenizers/punkt')


FileSystemPathPointer('C:\\Users\\DELL\\AppData\\Roaming\\nltk_data\\tokenizers\\punkt')